pawan2411 committed on
Commit
820bae1
·
verified ·
1 Parent(s): 617116e

MPCOT v4: Deep Panini + 10-language CoT SFT adapter

Browse files
Files changed (42) hide show
  1. .gitattributes +4 -0
  2. README.md +62 -0
  3. adapter_config.json +46 -0
  4. adapter_model.safetensors +3 -0
  5. chat_template.jinja +54 -0
  6. checkpoint-1400/README.md +209 -0
  7. checkpoint-1400/adapter_config.json +46 -0
  8. checkpoint-1400/adapter_model.safetensors +3 -0
  9. checkpoint-1400/chat_template.jinja +54 -0
  10. checkpoint-1400/optimizer.pt +3 -0
  11. checkpoint-1400/rng_state.pth +3 -0
  12. checkpoint-1400/scheduler.pt +3 -0
  13. checkpoint-1400/tokenizer.json +3 -0
  14. checkpoint-1400/tokenizer_config.json +30 -0
  15. checkpoint-1400/trainer_state.json +671 -0
  16. checkpoint-1400/training_args.bin +3 -0
  17. checkpoint-1600/README.md +209 -0
  18. checkpoint-1600/adapter_config.json +46 -0
  19. checkpoint-1600/adapter_model.safetensors +3 -0
  20. checkpoint-1600/chat_template.jinja +54 -0
  21. checkpoint-1600/optimizer.pt +3 -0
  22. checkpoint-1600/rng_state.pth +3 -0
  23. checkpoint-1600/scheduler.pt +3 -0
  24. checkpoint-1600/tokenizer.json +3 -0
  25. checkpoint-1600/tokenizer_config.json +30 -0
  26. checkpoint-1600/trainer_state.json +762 -0
  27. checkpoint-1600/training_args.bin +3 -0
  28. checkpoint-1683/README.md +209 -0
  29. checkpoint-1683/adapter_config.json +46 -0
  30. checkpoint-1683/adapter_model.safetensors +3 -0
  31. checkpoint-1683/chat_template.jinja +54 -0
  32. checkpoint-1683/optimizer.pt +3 -0
  33. checkpoint-1683/rng_state.pth +3 -0
  34. checkpoint-1683/scheduler.pt +3 -0
  35. checkpoint-1683/tokenizer.json +3 -0
  36. checkpoint-1683/tokenizer_config.json +30 -0
  37. checkpoint-1683/trainer_state.json +792 -0
  38. checkpoint-1683/training_args.bin +3 -0
  39. tokenizer.json +3 -0
  40. tokenizer_config.json +30 -0
  41. training_args.bin +3 -0
  42. training_config.json +11 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-1400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-1600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-1683/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ model_name: mpcot_qwen7b_lora
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ licence: license
12
+ pipeline_tag: text-generation
13
+ ---
14
+
15
+ # Model Card for mpcot_qwen7b_lora
16
+
17
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
18
+ It has been trained using [TRL](https://github.com/huggingface/trl).
19
+
20
+ ## Quick start
21
+
22
+ ```python
23
+ from transformers import pipeline
24
+
25
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
26
+ generator = pipeline("text-generation", model="None", device="cuda")
27
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
28
+ print(output["generated_text"])
29
+ ```
30
+
31
+ ## Training procedure
32
+
33
+
34
+
35
+
36
+
37
+ This model was trained with SFT.
38
+
39
+ ### Framework versions
40
+
41
+ - PEFT: 0.18.1
42
+ - TRL: 0.29.0
43
+ - Transformers: 5.0.0
44
+ - Pytorch: 2.10.0+cu128
45
+ - Datasets: 4.0.0
46
+ - Tokenizers: 0.22.2
47
+
48
+ ## Citations
49
+
50
+
51
+
52
+ Cite TRL as:
53
+
54
+ ```bibtex
55
+ @software{vonwerra2020trl,
56
+ title = {{TRL: Transformers Reinforcement Learning}},
57
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
58
+ license = {Apache-2.0},
59
+ url = {https://github.com/huggingface/trl},
60
+ year = {2020}
61
+ }
62
+ ```
adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 128,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "k_proj",
34
+ "gate_proj",
35
+ "down_proj",
36
+ "up_proj",
37
+ "q_proj",
38
+ "o_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be15156e4206b36e88697f973f0757a6eb2e18abadf49ac66348796353b26c7c
3
+ size 645975704
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-1400/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
checkpoint-1400/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 128,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "k_proj",
34
+ "gate_proj",
35
+ "down_proj",
36
+ "up_proj",
37
+ "q_proj",
38
+ "o_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-1400/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d49570902725151e1772a09d041780e1df02c9296722ec152853c17c967c6ef
3
+ size 645975704
checkpoint-1400/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-1400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f36b3e08fde39f1a70a3e960eb7d829189e8fd455fb30671eee91cfab4048829
3
+ size 1292182139
checkpoint-1400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c74bff3a7c4bc281b33b0b8e11d8123d149fa629cd264735e3d1419cd7b1386
3
+ size 14645
checkpoint-1400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a57b1a48d66b223ec979eed79f59f1aeaab5c15f1823e44893266234805bbea6
3
+ size 1465
checkpoint-1400/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
checkpoint-1400/tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "padding_side": "right",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "Qwen2Tokenizer",
29
+ "unk_token": null
30
+ }
checkpoint-1400/trainer_state.json ADDED
@@ -0,0 +1,671 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.4955436720142603,
6
+ "eval_steps": 200,
7
+ "global_step": 1400,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.1273316520452499,
14
+ "epoch": 0.044563279857397504,
15
+ "grad_norm": 2.0025203227996826,
16
+ "learning_rate": 3.6e-06,
17
+ "loss": 2.2611521911621093,
18
+ "mean_token_accuracy": 0.6291543507575988,
19
+ "num_tokens": 363359.0,
20
+ "step": 25
21
+ },
22
+ {
23
+ "entropy": 1.391269074678421,
24
+ "epoch": 0.08912655971479501,
25
+ "grad_norm": 1.0109221935272217,
26
+ "learning_rate": 7.35e-06,
27
+ "loss": 1.767060546875,
28
+ "mean_token_accuracy": 0.6594330656528473,
29
+ "num_tokens": 724311.0,
30
+ "step": 50
31
+ },
32
+ {
33
+ "entropy": 1.2697400665283203,
34
+ "epoch": 0.13368983957219252,
35
+ "grad_norm": 0.4787505269050598,
36
+ "learning_rate": 1.11e-05,
37
+ "loss": 1.231834716796875,
38
+ "mean_token_accuracy": 0.7449960750341416,
39
+ "num_tokens": 1089859.0,
40
+ "step": 75
41
+ },
42
+ {
43
+ "entropy": 1.0446020710468291,
44
+ "epoch": 0.17825311942959002,
45
+ "grad_norm": 0.39569053053855896,
46
+ "learning_rate": 1.485e-05,
47
+ "loss": 1.0368045043945313,
48
+ "mean_token_accuracy": 0.7771743559837341,
49
+ "num_tokens": 1456575.0,
50
+ "step": 100
51
+ },
52
+ {
53
+ "entropy": 0.9694650781154632,
54
+ "epoch": 0.22281639928698752,
55
+ "grad_norm": 0.44794389605522156,
56
+ "learning_rate": 1.4991494309781894e-05,
57
+ "loss": 0.9510629272460938,
58
+ "mean_token_accuracy": 0.7904590421915054,
59
+ "num_tokens": 1819729.0,
60
+ "step": 125
61
+ },
62
+ {
63
+ "entropy": 0.9067935299873352,
64
+ "epoch": 0.26737967914438504,
65
+ "grad_norm": 0.49256861209869385,
66
+ "learning_rate": 1.4964566090257208e-05,
67
+ "loss": 0.8909156036376953,
68
+ "mean_token_accuracy": 0.8001276826858521,
69
+ "num_tokens": 2185895.0,
70
+ "step": 150
71
+ },
72
+ {
73
+ "entropy": 0.8762328952550889,
74
+ "epoch": 0.31194295900178254,
75
+ "grad_norm": 0.48932692408561707,
76
+ "learning_rate": 1.4919266844792835e-05,
77
+ "loss": 0.8628057098388672,
78
+ "mean_token_accuracy": 0.8043822544813156,
79
+ "num_tokens": 2554889.0,
80
+ "step": 175
81
+ },
82
+ {
83
+ "entropy": 0.8572803306579589,
84
+ "epoch": 0.35650623885918004,
85
+ "grad_norm": 0.5422897338867188,
86
+ "learning_rate": 1.485570805925459e-05,
87
+ "loss": 0.8397312927246093,
88
+ "mean_token_accuracy": 0.8085139858722686,
89
+ "num_tokens": 2920719.0,
90
+ "step": 200
91
+ },
92
+ {
93
+ "epoch": 0.35650623885918004,
94
+ "eval_entropy": 0.8323967654705048,
95
+ "eval_loss": 0.8291334509849548,
96
+ "eval_mean_token_accuracy": 0.8096436858177185,
97
+ "eval_num_tokens": 2920719.0,
98
+ "eval_runtime": 74.8901,
99
+ "eval_samples_per_second": 13.313,
100
+ "eval_steps_per_second": 3.338,
101
+ "step": 200
102
+ },
103
+ {
104
+ "entropy": 0.8403333276510239,
105
+ "epoch": 0.40106951871657753,
106
+ "grad_norm": 0.5801687240600586,
107
+ "learning_rate": 1.4774046158019147e-05,
108
+ "loss": 0.8235167694091797,
109
+ "mean_token_accuracy": 0.8098820477724076,
110
+ "num_tokens": 3288435.0,
111
+ "step": 225
112
+ },
113
+ {
114
+ "entropy": 0.817181087732315,
115
+ "epoch": 0.44563279857397503,
116
+ "grad_norm": 0.603880763053894,
117
+ "learning_rate": 1.467448211899838e-05,
118
+ "loss": 0.799793701171875,
119
+ "mean_token_accuracy": 0.8144050502777099,
120
+ "num_tokens": 3654202.0,
121
+ "step": 250
122
+ },
123
+ {
124
+ "entropy": 0.8009092861413956,
125
+ "epoch": 0.49019607843137253,
126
+ "grad_norm": 0.5765889883041382,
127
+ "learning_rate": 1.4557260979013106e-05,
128
+ "loss": 0.7848175811767578,
129
+ "mean_token_accuracy": 0.8172187548875809,
130
+ "num_tokens": 4016287.0,
131
+ "step": 275
132
+ },
133
+ {
134
+ "entropy": 0.8024452942609787,
135
+ "epoch": 0.5347593582887701,
136
+ "grad_norm": 0.7014256715774536,
137
+ "learning_rate": 1.4422671230733536e-05,
138
+ "loss": 0.7894332122802734,
139
+ "mean_token_accuracy": 0.8166103160381317,
140
+ "num_tokens": 4379803.0,
141
+ "step": 300
142
+ },
143
+ {
144
+ "entropy": 0.7904212397336959,
145
+ "epoch": 0.5793226381461676,
146
+ "grad_norm": 0.6543148756027222,
147
+ "learning_rate": 1.4271044112670647e-05,
148
+ "loss": 0.7738318634033203,
149
+ "mean_token_accuracy": 0.8181957858800888,
150
+ "num_tokens": 4748127.0,
151
+ "step": 325
152
+ },
153
+ {
154
+ "entropy": 0.7665162217617035,
155
+ "epoch": 0.6238859180035651,
156
+ "grad_norm": 0.7135517001152039,
157
+ "learning_rate": 1.410275279396588e-05,
158
+ "loss": 0.7533625030517578,
159
+ "mean_token_accuracy": 0.8217650431394578,
160
+ "num_tokens": 5113040.0,
161
+ "step": 350
162
+ },
163
+ {
164
+ "entropy": 0.7557546135783195,
165
+ "epoch": 0.6684491978609626,
166
+ "grad_norm": 0.6762207746505737,
167
+ "learning_rate": 1.3918211455985435e-05,
168
+ "loss": 0.7417732238769531,
169
+ "mean_token_accuracy": 0.8234544372558594,
170
+ "num_tokens": 5477938.0,
171
+ "step": 375
172
+ },
173
+ {
174
+ "entropy": 0.7480651473999024,
175
+ "epoch": 0.7130124777183601,
176
+ "grad_norm": 0.6766519546508789,
177
+ "learning_rate": 1.3717874272979488e-05,
178
+ "loss": 0.7343754577636719,
179
+ "mean_token_accuracy": 0.8247038215398789,
180
+ "num_tokens": 5846777.0,
181
+ "step": 400
182
+ },
183
+ {
184
+ "epoch": 0.7130124777183601,
185
+ "eval_entropy": 0.7494170541763305,
186
+ "eval_loss": 0.7359814047813416,
187
+ "eval_mean_token_accuracy": 0.8247466235160827,
188
+ "eval_num_tokens": 5846777.0,
189
+ "eval_runtime": 74.7841,
190
+ "eval_samples_per_second": 13.332,
191
+ "eval_steps_per_second": 3.343,
192
+ "step": 400
193
+ },
194
+ {
195
+ "entropy": 0.7388822847604751,
196
+ "epoch": 0.7575757575757576,
197
+ "grad_norm": 0.7576785683631897,
198
+ "learning_rate": 1.350223429431504e-05,
199
+ "loss": 0.7303533935546875,
200
+ "mean_token_accuracy": 0.82606416285038,
201
+ "num_tokens": 6212618.0,
202
+ "step": 425
203
+ },
204
+ {
205
+ "entropy": 0.7430421102046967,
206
+ "epoch": 0.8021390374331551,
207
+ "grad_norm": 0.7369253635406494,
208
+ "learning_rate": 1.3271822231033263e-05,
209
+ "loss": 0.7292195129394531,
210
+ "mean_token_accuracy": 0.8252584689855575,
211
+ "num_tokens": 6578764.0,
212
+ "step": 450
213
+ },
214
+ {
215
+ "entropy": 0.7350365900993348,
216
+ "epoch": 0.8467023172905526,
217
+ "grad_norm": 0.7027698755264282,
218
+ "learning_rate": 1.3027205149717825e-05,
219
+ "loss": 0.7203064727783203,
220
+ "mean_token_accuracy": 0.8271685636043549,
221
+ "num_tokens": 6940517.0,
222
+ "step": 475
223
+ },
224
+ {
225
+ "entropy": 0.7169802790880203,
226
+ "epoch": 0.8912655971479501,
227
+ "grad_norm": 0.7340224981307983,
228
+ "learning_rate": 1.276898507688866e-05,
229
+ "loss": 0.705379867553711,
230
+ "mean_token_accuracy": 0.8299148625135422,
231
+ "num_tokens": 7306466.0,
232
+ "step": 500
233
+ },
234
+ {
235
+ "entropy": 0.7128468745946884,
236
+ "epoch": 0.9358288770053476,
237
+ "grad_norm": 0.7902767658233643,
238
+ "learning_rate": 1.2497797517355924e-05,
239
+ "loss": 0.6976683807373046,
240
+ "mean_token_accuracy": 0.8309504073858262,
241
+ "num_tokens": 7675590.0,
242
+ "step": 525
243
+ },
244
+ {
245
+ "entropy": 0.7067722028493881,
246
+ "epoch": 0.9803921568627451,
247
+ "grad_norm": 0.7943085432052612,
248
+ "learning_rate": 1.2214309890180613e-05,
249
+ "loss": 0.6949668884277344,
250
+ "mean_token_accuracy": 0.8305781084299088,
251
+ "num_tokens": 8042404.0,
252
+ "step": 550
253
+ },
254
+ {
255
+ "entropy": 0.695909548997879,
256
+ "epoch": 1.0249554367201426,
257
+ "grad_norm": 0.7510514259338379,
258
+ "learning_rate": 1.191921988609109e-05,
259
+ "loss": 0.6792121124267578,
260
+ "mean_token_accuracy": 0.8343433332443237,
261
+ "num_tokens": 8403933.0,
262
+ "step": 575
263
+ },
264
+ {
265
+ "entropy": 0.6738390463590622,
266
+ "epoch": 1.0695187165775402,
267
+ "grad_norm": 0.8021165132522583,
268
+ "learning_rate": 1.1613253750398085e-05,
269
+ "loss": 0.6603101348876953,
270
+ "mean_token_accuracy": 0.8382544696331025,
271
+ "num_tokens": 8772072.0,
272
+ "step": 600
273
+ },
274
+ {
275
+ "epoch": 1.0695187165775402,
276
+ "eval_entropy": 0.6920017371177674,
277
+ "eval_loss": 0.6961521506309509,
278
+ "eval_mean_token_accuracy": 0.8314581851959228,
279
+ "eval_num_tokens": 8772072.0,
280
+ "eval_runtime": 74.8097,
281
+ "eval_samples_per_second": 13.327,
282
+ "eval_steps_per_second": 3.342,
283
+ "step": 600
284
+ },
285
+ {
286
+ "entropy": 0.6920944279432297,
287
+ "epoch": 1.1140819964349375,
288
+ "grad_norm": 0.8023701310157776,
289
+ "learning_rate": 1.1297164495634069e-05,
290
+ "loss": 0.6772218322753907,
291
+ "mean_token_accuracy": 0.8343758553266525,
292
+ "num_tokens": 9137160.0,
293
+ "step": 625
294
+ },
295
+ {
296
+ "entropy": 0.67285136282444,
297
+ "epoch": 1.1586452762923352,
298
+ "grad_norm": 0.7788256406784058,
299
+ "learning_rate": 1.0971730048315917e-05,
300
+ "loss": 0.6581203460693359,
301
+ "mean_token_accuracy": 0.8390156370401383,
302
+ "num_tokens": 9505580.0,
303
+ "step": 650
304
+ },
305
+ {
306
+ "entropy": 0.6888180702924729,
307
+ "epoch": 1.2032085561497325,
308
+ "grad_norm": 0.8268939256668091,
309
+ "learning_rate": 1.0637751334391775e-05,
310
+ "loss": 0.673553466796875,
311
+ "mean_token_accuracy": 0.8359775388240814,
312
+ "num_tokens": 9868570.0,
313
+ "step": 675
314
+ },
315
+ {
316
+ "entropy": 0.6915264892578125,
317
+ "epoch": 1.2477718360071302,
318
+ "grad_norm": 0.8361654877662659,
319
+ "learning_rate": 1.0296050308084114e-05,
320
+ "loss": 0.6790201568603516,
321
+ "mean_token_accuracy": 0.8342142343521118,
322
+ "num_tokens": 10229373.0,
323
+ "step": 700
324
+ },
325
+ {
326
+ "entropy": 0.6885707491636276,
327
+ "epoch": 1.2923351158645278,
328
+ "grad_norm": 0.7386716604232788,
329
+ "learning_rate": 9.94746792898014e-06,
330
+ "loss": 0.6720596313476562,
331
+ "mean_token_accuracy": 0.8353542894124985,
332
+ "num_tokens": 10595419.0,
333
+ "step": 725
334
+ },
335
+ {
336
+ "entropy": 0.6660267195105553,
337
+ "epoch": 1.3368983957219251,
338
+ "grad_norm": 0.7973800897598267,
339
+ "learning_rate": 9.59286209234813e-06,
340
+ "loss": 0.6550118255615235,
341
+ "mean_token_accuracy": 0.8386269718408584,
342
+ "num_tokens": 10960517.0,
343
+ "step": 750
344
+ },
345
+ {
346
+ "entropy": 0.6469692060351372,
347
+ "epoch": 1.3814616755793225,
348
+ "grad_norm": 0.798152506351471,
349
+ "learning_rate": 9.233105517773445e-06,
350
+ "loss": 0.6308420181274415,
351
+ "mean_token_accuracy": 0.8429271316528321,
352
+ "num_tokens": 11328702.0,
353
+ "step": 775
354
+ },
355
+ {
356
+ "entropy": 0.6708013540506363,
357
+ "epoch": 1.4260249554367201,
358
+ "grad_norm": 0.9537823796272278,
359
+ "learning_rate": 8.869083601310398e-06,
360
+ "loss": 0.6537622833251953,
361
+ "mean_token_accuracy": 0.838316883444786,
362
+ "num_tokens": 11697546.0,
363
+ "step": 800
364
+ },
365
+ {
366
+ "epoch": 1.4260249554367201,
367
+ "eval_entropy": 0.670824561715126,
368
+ "eval_loss": 0.6723578572273254,
369
+ "eval_mean_token_accuracy": 0.8353032109737396,
370
+ "eval_num_tokens": 11697546.0,
371
+ "eval_runtime": 74.7664,
372
+ "eval_samples_per_second": 13.335,
373
+ "eval_steps_per_second": 3.344,
374
+ "step": 800
375
+ },
376
+ {
377
+ "entropy": 0.659270493388176,
378
+ "epoch": 1.4705882352941178,
379
+ "grad_norm": 0.846034586429596,
380
+ "learning_rate": 8.501692236436132e-06,
381
+ "loss": 0.6444293212890625,
382
+ "mean_token_accuracy": 0.8404667204618455,
383
+ "num_tokens": 12061827.0,
384
+ "step": 825
385
+ },
386
+ {
387
+ "entropy": 0.6627422112226486,
388
+ "epoch": 1.5151515151515151,
389
+ "grad_norm": 0.9181033968925476,
390
+ "learning_rate": 8.131835609169295e-06,
391
+ "loss": 0.6494012451171876,
392
+ "mean_token_accuracy": 0.839583694934845,
393
+ "num_tokens": 12427853.0,
394
+ "step": 850
395
+ },
396
+ {
397
+ "entropy": 0.6641036707162857,
398
+ "epoch": 1.5597147950089125,
399
+ "grad_norm": 0.858001172542572,
400
+ "learning_rate": 7.760423972779985e-06,
401
+ "loss": 0.6495742797851562,
402
+ "mean_token_accuracy": 0.8395592844486237,
403
+ "num_tokens": 12799973.0,
404
+ "step": 875
405
+ },
406
+ {
407
+ "entropy": 0.6689085793495179,
408
+ "epoch": 1.6042780748663101,
409
+ "grad_norm": 0.8615349531173706,
410
+ "learning_rate": 7.388371407567565e-06,
411
+ "loss": 0.6532559967041016,
412
+ "mean_token_accuracy": 0.8388407498598098,
413
+ "num_tokens": 13166796.0,
414
+ "step": 900
415
+ },
416
+ {
417
+ "entropy": 0.6729245400428772,
418
+ "epoch": 1.6488413547237077,
419
+ "grad_norm": 0.831142783164978,
420
+ "learning_rate": 7.01659357121981e-06,
421
+ "loss": 0.6572090911865235,
422
+ "mean_token_accuracy": 0.8372052818536758,
423
+ "num_tokens": 13532499.0,
424
+ "step": 925
425
+ },
426
+ {
427
+ "entropy": 0.6538485777378082,
428
+ "epoch": 1.6934046345811051,
429
+ "grad_norm": 0.919346809387207,
430
+ "learning_rate": 6.6460054452899315e-06,
431
+ "loss": 0.6404708862304688,
432
+ "mean_token_accuracy": 0.8411308795213699,
433
+ "num_tokens": 13898404.0,
434
+ "step": 950
435
+ },
436
+ {
437
+ "entropy": 0.6691750481724739,
438
+ "epoch": 1.7379679144385025,
439
+ "grad_norm": 0.9280221462249756,
440
+ "learning_rate": 6.277519083337656e-06,
441
+ "loss": 0.6546466827392579,
442
+ "mean_token_accuracy": 0.838825848698616,
443
+ "num_tokens": 14261658.0,
444
+ "step": 975
445
+ },
446
+ {
447
+ "entropy": 0.6536609560251236,
448
+ "epoch": 1.7825311942959001,
449
+ "grad_norm": 0.9000495076179504,
450
+ "learning_rate": 5.9120413662763545e-06,
451
+ "loss": 0.6405950927734375,
452
+ "mean_token_accuracy": 0.8412596487998962,
453
+ "num_tokens": 14625008.0,
454
+ "step": 1000
455
+ },
456
+ {
457
+ "epoch": 1.7825311942959001,
458
+ "eval_entropy": 0.6716028243303299,
459
+ "eval_loss": 0.6561057567596436,
460
+ "eval_mean_token_accuracy": 0.8381222817897797,
461
+ "eval_num_tokens": 14625008.0,
462
+ "eval_runtime": 74.7617,
463
+ "eval_samples_per_second": 13.336,
464
+ "eval_steps_per_second": 3.344,
465
+ "step": 1000
466
+ },
467
+ {
468
+ "entropy": 0.6671841683983802,
469
+ "epoch": 1.8270944741532977,
470
+ "grad_norm": 0.8711400628089905,
471
+ "learning_rate": 5.550471770450572e-06,
472
+ "loss": 0.6500684356689453,
473
+ "mean_token_accuracy": 0.8389109486341476,
474
+ "num_tokens": 14985559.0,
475
+ "step": 1025
476
+ },
477
+ {
478
+ "entropy": 0.6568678751587868,
479
+ "epoch": 1.8716577540106951,
480
+ "grad_norm": 0.9135516285896301,
481
+ "learning_rate": 5.193700153936934e-06,
482
+ "loss": 0.6418634033203126,
483
+ "mean_token_accuracy": 0.8414819967746735,
484
+ "num_tokens": 15354311.0,
485
+ "step": 1050
486
+ },
487
+ {
488
+ "entropy": 0.6430006143450737,
489
+ "epoch": 1.9162210338680927,
490
+ "grad_norm": 0.9346958994865417,
491
+ "learning_rate": 4.842604566516537e-06,
492
+ "loss": 0.6278348541259766,
493
+ "mean_token_accuracy": 0.8434987276792526,
494
+ "num_tokens": 15721382.0,
495
+ "step": 1075
496
+ },
497
+ {
498
+ "entropy": 0.6387567144632339,
499
+ "epoch": 1.9607843137254903,
500
+ "grad_norm": 0.9693854451179504,
501
+ "learning_rate": 4.498049088708706e-06,
502
+ "loss": 0.6229427337646485,
503
+ "mean_token_accuracy": 0.8442350590229034,
504
+ "num_tokens": 16088038.0,
505
+ "step": 1100
506
+ },
507
+ {
508
+ "entropy": 0.6434592244029045,
509
+ "epoch": 2.0053475935828877,
510
+ "grad_norm": 0.9158383011817932,
511
+ "learning_rate": 4.160881705184478e-06,
512
+ "loss": 0.6287346649169921,
513
+ "mean_token_accuracy": 0.8434397971630097,
514
+ "num_tokens": 16448228.0,
515
+ "step": 1125
516
+ },
517
+ {
518
+ "entropy": 0.6293540370464324,
519
+ "epoch": 2.049910873440285,
520
+ "grad_norm": 0.9278510808944702,
521
+ "learning_rate": 3.831932217793526e-06,
522
+ "loss": 0.6089762115478515,
523
+ "mean_token_accuracy": 0.8473779886960984,
524
+ "num_tokens": 16812866.0,
525
+ "step": 1150
526
+ },
527
+ {
528
+ "entropy": 0.6246551343798638,
529
+ "epoch": 2.0944741532976825,
530
+ "grad_norm": 0.8729245066642761,
531
+ "learning_rate": 3.5120102033408053e-06,
532
+ "loss": 0.6066710281372071,
533
+ "mean_token_accuracy": 0.8471958756446838,
534
+ "num_tokens": 17177909.0,
535
+ "step": 1175
536
+ },
537
+ {
538
+ "entropy": 0.6269071605801583,
539
+ "epoch": 2.1390374331550803,
540
+ "grad_norm": 0.8709802031517029,
541
+ "learning_rate": 3.201903021138983e-06,
542
+ "loss": 0.6111587905883789,
543
+ "mean_token_accuracy": 0.8464664667844772,
544
+ "num_tokens": 17544377.0,
545
+ "step": 1200
546
+ },
547
+ {
548
+ "epoch": 2.1390374331550803,
549
+ "eval_entropy": 0.6344557646512985,
550
+ "eval_loss": 0.6462315320968628,
551
+ "eval_mean_token_accuracy": 0.8403205525875092,
552
+ "eval_num_tokens": 17544377.0,
553
+ "eval_runtime": 74.8344,
554
+ "eval_samples_per_second": 13.323,
555
+ "eval_steps_per_second": 3.341,
556
+ "step": 1200
557
+ },
558
+ {
559
+ "entropy": 0.617467094361782,
560
+ "epoch": 2.1836007130124777,
561
+ "grad_norm": 0.8771170973777771,
562
+ "learning_rate": 2.9023738752403013e-06,
563
+ "loss": 0.5986224746704102,
564
+ "mean_token_accuracy": 0.849560460448265,
565
+ "num_tokens": 17912855.0,
566
+ "step": 1225
567
+ },
568
+ {
569
+ "entropy": 0.6177873882651329,
570
+ "epoch": 2.228163992869875,
571
+ "grad_norm": 1.0253841876983643,
572
+ "learning_rate": 2.614159936116893e-06,
573
+ "loss": 0.5998103332519531,
574
+ "mean_token_accuracy": 0.8487882578372955,
575
+ "num_tokens": 18279476.0,
576
+ "step": 1250
577
+ },
578
+ {
579
+ "entropy": 0.6312283331155777,
580
+ "epoch": 2.2727272727272725,
581
+ "grad_norm": 0.9465038180351257,
582
+ "learning_rate": 2.337970526412267e-06,
583
+ "loss": 0.6118741226196289,
584
+ "mean_token_accuracy": 0.8458875006437302,
585
+ "num_tokens": 18644269.0,
586
+ "step": 1275
587
+ },
588
+ {
589
+ "entropy": 0.6209010258316994,
590
+ "epoch": 2.3172905525846703,
591
+ "grad_norm": 0.9807332158088684,
592
+ "learning_rate": 2.074485375229037e-06,
593
+ "loss": 0.6052029037475586,
594
+ "mean_token_accuracy": 0.8471564346551895,
595
+ "num_tokens": 19009107.0,
596
+ "step": 1300
597
+ },
598
+ {
599
+ "entropy": 0.6401337105035781,
600
+ "epoch": 2.3618538324420677,
601
+ "grad_norm": 1.0486506223678589,
602
+ "learning_rate": 1.82435294524924e-06,
603
+ "loss": 0.6207434463500977,
604
+ "mean_token_accuracy": 0.8439285135269166,
605
+ "num_tokens": 19374349.0,
606
+ "step": 1325
607
+ },
608
+ {
609
+ "entropy": 0.6109014016389847,
610
+ "epoch": 2.406417112299465,
611
+ "grad_norm": 0.9694714546203613,
612
+ "learning_rate": 1.5881888368043559e-06,
613
+ "loss": 0.5924215316772461,
614
+ "mean_token_accuracy": 0.8494464015960693,
615
+ "num_tokens": 19743047.0,
616
+ "step": 1350
617
+ },
618
+ {
619
+ "entropy": 0.6300237196683883,
620
+ "epoch": 2.450980392156863,
621
+ "grad_norm": 0.9961308836936951,
622
+ "learning_rate": 1.3665742728227932e-06,
623
+ "loss": 0.6133406066894531,
624
+ "mean_token_accuracy": 0.8462675029039383,
625
+ "num_tokens": 20105853.0,
626
+ "step": 1375
627
+ },
628
+ {
629
+ "entropy": 0.6148158556222916,
630
+ "epoch": 2.4955436720142603,
631
+ "grad_norm": 1.0224037170410156,
632
+ "learning_rate": 1.1600546683835065e-06,
633
+ "loss": 0.5978146362304687,
634
+ "mean_token_accuracy": 0.8488863033056259,
635
+ "num_tokens": 20469876.0,
636
+ "step": 1400
637
+ },
638
+ {
639
+ "epoch": 2.4955436720142603,
640
+ "eval_entropy": 0.627735008597374,
641
+ "eval_loss": 0.6408645510673523,
642
+ "eval_mean_token_accuracy": 0.8411034562587738,
643
+ "eval_num_tokens": 20469876.0,
644
+ "eval_runtime": 74.7955,
645
+ "eval_samples_per_second": 13.33,
646
+ "eval_steps_per_second": 3.342,
647
+ "step": 1400
648
+ }
649
+ ],
650
+ "logging_steps": 25,
651
+ "max_steps": 1683,
652
+ "num_input_tokens_seen": 0,
653
+ "num_train_epochs": 3,
654
+ "save_steps": 200,
655
+ "stateful_callbacks": {
656
+ "TrainerControl": {
657
+ "args": {
658
+ "should_epoch_stop": false,
659
+ "should_evaluate": false,
660
+ "should_log": false,
661
+ "should_save": true,
662
+ "should_training_stop": false
663
+ },
664
+ "attributes": {}
665
+ }
666
+ },
667
+ "total_flos": 9.869455724212224e+17,
668
+ "train_batch_size": 4,
669
+ "trial_name": null,
670
+ "trial_params": null
671
+ }
checkpoint-1400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33ef6c419df9522b8fc114339f82c3e390d52dabd314191546cdbca2a738c0af
3
+ size 5585
checkpoint-1600/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
checkpoint-1600/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 128,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "k_proj",
34
+ "gate_proj",
35
+ "down_proj",
36
+ "up_proj",
37
+ "q_proj",
38
+ "o_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-1600/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62be16623fffeb7fff7cfa473f3f40f65a8b24e56c10115f17f126702ebd0145
3
+ size 645975704
checkpoint-1600/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-1600/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d64562ee726613097524ef08ae70a14248a08208bd4dbce81ba22d4e00986a6a
3
+ size 1292182139
checkpoint-1600/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbfa4fbea9e0f3d81284f0a321de33b26f22102eb534f6f79635582e04d4f709
3
+ size 14645
checkpoint-1600/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f33245ca6596ae95a0f5c97c8ab914705616abef8b0a7e2812b61318bef5fff
3
+ size 1465
checkpoint-1600/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
checkpoint-1600/tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "padding_side": "right",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "Qwen2Tokenizer",
29
+ "unk_token": null
30
+ }
checkpoint-1600/trainer_state.json ADDED
@@ -0,0 +1,762 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.8520499108734403,
6
+ "eval_steps": 200,
7
+ "global_step": 1600,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.1273316520452499,
14
+ "epoch": 0.044563279857397504,
15
+ "grad_norm": 2.0025203227996826,
16
+ "learning_rate": 3.6e-06,
17
+ "loss": 2.2611521911621093,
18
+ "mean_token_accuracy": 0.6291543507575988,
19
+ "num_tokens": 363359.0,
20
+ "step": 25
21
+ },
22
+ {
23
+ "entropy": 1.391269074678421,
24
+ "epoch": 0.08912655971479501,
25
+ "grad_norm": 1.0109221935272217,
26
+ "learning_rate": 7.35e-06,
27
+ "loss": 1.767060546875,
28
+ "mean_token_accuracy": 0.6594330656528473,
29
+ "num_tokens": 724311.0,
30
+ "step": 50
31
+ },
32
+ {
33
+ "entropy": 1.2697400665283203,
34
+ "epoch": 0.13368983957219252,
35
+ "grad_norm": 0.4787505269050598,
36
+ "learning_rate": 1.11e-05,
37
+ "loss": 1.231834716796875,
38
+ "mean_token_accuracy": 0.7449960750341416,
39
+ "num_tokens": 1089859.0,
40
+ "step": 75
41
+ },
42
+ {
43
+ "entropy": 1.0446020710468291,
44
+ "epoch": 0.17825311942959002,
45
+ "grad_norm": 0.39569053053855896,
46
+ "learning_rate": 1.485e-05,
47
+ "loss": 1.0368045043945313,
48
+ "mean_token_accuracy": 0.7771743559837341,
49
+ "num_tokens": 1456575.0,
50
+ "step": 100
51
+ },
52
+ {
53
+ "entropy": 0.9694650781154632,
54
+ "epoch": 0.22281639928698752,
55
+ "grad_norm": 0.44794389605522156,
56
+ "learning_rate": 1.4991494309781894e-05,
57
+ "loss": 0.9510629272460938,
58
+ "mean_token_accuracy": 0.7904590421915054,
59
+ "num_tokens": 1819729.0,
60
+ "step": 125
61
+ },
62
+ {
63
+ "entropy": 0.9067935299873352,
64
+ "epoch": 0.26737967914438504,
65
+ "grad_norm": 0.49256861209869385,
66
+ "learning_rate": 1.4964566090257208e-05,
67
+ "loss": 0.8909156036376953,
68
+ "mean_token_accuracy": 0.8001276826858521,
69
+ "num_tokens": 2185895.0,
70
+ "step": 150
71
+ },
72
+ {
73
+ "entropy": 0.8762328952550889,
74
+ "epoch": 0.31194295900178254,
75
+ "grad_norm": 0.48932692408561707,
76
+ "learning_rate": 1.4919266844792835e-05,
77
+ "loss": 0.8628057098388672,
78
+ "mean_token_accuracy": 0.8043822544813156,
79
+ "num_tokens": 2554889.0,
80
+ "step": 175
81
+ },
82
+ {
83
+ "entropy": 0.8572803306579589,
84
+ "epoch": 0.35650623885918004,
85
+ "grad_norm": 0.5422897338867188,
86
+ "learning_rate": 1.485570805925459e-05,
87
+ "loss": 0.8397312927246093,
88
+ "mean_token_accuracy": 0.8085139858722686,
89
+ "num_tokens": 2920719.0,
90
+ "step": 200
91
+ },
92
+ {
93
+ "epoch": 0.35650623885918004,
94
+ "eval_entropy": 0.8323967654705048,
95
+ "eval_loss": 0.8291334509849548,
96
+ "eval_mean_token_accuracy": 0.8096436858177185,
97
+ "eval_num_tokens": 2920719.0,
98
+ "eval_runtime": 74.8901,
99
+ "eval_samples_per_second": 13.313,
100
+ "eval_steps_per_second": 3.338,
101
+ "step": 200
102
+ },
103
+ {
104
+ "entropy": 0.8403333276510239,
105
+ "epoch": 0.40106951871657753,
106
+ "grad_norm": 0.5801687240600586,
107
+ "learning_rate": 1.4774046158019147e-05,
108
+ "loss": 0.8235167694091797,
109
+ "mean_token_accuracy": 0.8098820477724076,
110
+ "num_tokens": 3288435.0,
111
+ "step": 225
112
+ },
113
+ {
114
+ "entropy": 0.817181087732315,
115
+ "epoch": 0.44563279857397503,
116
+ "grad_norm": 0.603880763053894,
117
+ "learning_rate": 1.467448211899838e-05,
118
+ "loss": 0.799793701171875,
119
+ "mean_token_accuracy": 0.8144050502777099,
120
+ "num_tokens": 3654202.0,
121
+ "step": 250
122
+ },
123
+ {
124
+ "entropy": 0.8009092861413956,
125
+ "epoch": 0.49019607843137253,
126
+ "grad_norm": 0.5765889883041382,
127
+ "learning_rate": 1.4557260979013106e-05,
128
+ "loss": 0.7848175811767578,
129
+ "mean_token_accuracy": 0.8172187548875809,
130
+ "num_tokens": 4016287.0,
131
+ "step": 275
132
+ },
133
+ {
134
+ "entropy": 0.8024452942609787,
135
+ "epoch": 0.5347593582887701,
136
+ "grad_norm": 0.7014256715774536,
137
+ "learning_rate": 1.4422671230733536e-05,
138
+ "loss": 0.7894332122802734,
139
+ "mean_token_accuracy": 0.8166103160381317,
140
+ "num_tokens": 4379803.0,
141
+ "step": 300
142
+ },
143
+ {
144
+ "entropy": 0.7904212397336959,
145
+ "epoch": 0.5793226381461676,
146
+ "grad_norm": 0.6543148756027222,
147
+ "learning_rate": 1.4271044112670647e-05,
148
+ "loss": 0.7738318634033203,
149
+ "mean_token_accuracy": 0.8181957858800888,
150
+ "num_tokens": 4748127.0,
151
+ "step": 325
152
+ },
153
+ {
154
+ "entropy": 0.7665162217617035,
155
+ "epoch": 0.6238859180035651,
156
+ "grad_norm": 0.7135517001152039,
157
+ "learning_rate": 1.410275279396588e-05,
158
+ "loss": 0.7533625030517578,
159
+ "mean_token_accuracy": 0.8217650431394578,
160
+ "num_tokens": 5113040.0,
161
+ "step": 350
162
+ },
163
+ {
164
+ "entropy": 0.7557546135783195,
165
+ "epoch": 0.6684491978609626,
166
+ "grad_norm": 0.6762207746505737,
167
+ "learning_rate": 1.3918211455985435e-05,
168
+ "loss": 0.7417732238769531,
169
+ "mean_token_accuracy": 0.8234544372558594,
170
+ "num_tokens": 5477938.0,
171
+ "step": 375
172
+ },
173
+ {
174
+ "entropy": 0.7480651473999024,
175
+ "epoch": 0.7130124777183601,
176
+ "grad_norm": 0.6766519546508789,
177
+ "learning_rate": 1.3717874272979488e-05,
178
+ "loss": 0.7343754577636719,
179
+ "mean_token_accuracy": 0.8247038215398789,
180
+ "num_tokens": 5846777.0,
181
+ "step": 400
182
+ },
183
+ {
184
+ "epoch": 0.7130124777183601,
185
+ "eval_entropy": 0.7494170541763305,
186
+ "eval_loss": 0.7359814047813416,
187
+ "eval_mean_token_accuracy": 0.8247466235160827,
188
+ "eval_num_tokens": 5846777.0,
189
+ "eval_runtime": 74.7841,
190
+ "eval_samples_per_second": 13.332,
191
+ "eval_steps_per_second": 3.343,
192
+ "step": 400
193
+ },
194
+ {
195
+ "entropy": 0.7388822847604751,
196
+ "epoch": 0.7575757575757576,
197
+ "grad_norm": 0.7576785683631897,
198
+ "learning_rate": 1.350223429431504e-05,
199
+ "loss": 0.7303533935546875,
200
+ "mean_token_accuracy": 0.82606416285038,
201
+ "num_tokens": 6212618.0,
202
+ "step": 425
203
+ },
204
+ {
205
+ "entropy": 0.7430421102046967,
206
+ "epoch": 0.8021390374331551,
207
+ "grad_norm": 0.7369253635406494,
208
+ "learning_rate": 1.3271822231033263e-05,
209
+ "loss": 0.7292195129394531,
210
+ "mean_token_accuracy": 0.8252584689855575,
211
+ "num_tokens": 6578764.0,
212
+ "step": 450
213
+ },
214
+ {
215
+ "entropy": 0.7350365900993348,
216
+ "epoch": 0.8467023172905526,
217
+ "grad_norm": 0.7027698755264282,
218
+ "learning_rate": 1.3027205149717825e-05,
219
+ "loss": 0.7203064727783203,
220
+ "mean_token_accuracy": 0.8271685636043549,
221
+ "num_tokens": 6940517.0,
222
+ "step": 475
223
+ },
224
+ {
225
+ "entropy": 0.7169802790880203,
226
+ "epoch": 0.8912655971479501,
227
+ "grad_norm": 0.7340224981307983,
228
+ "learning_rate": 1.276898507688866e-05,
229
+ "loss": 0.705379867553711,
230
+ "mean_token_accuracy": 0.8299148625135422,
231
+ "num_tokens": 7306466.0,
232
+ "step": 500
233
+ },
234
+ {
235
+ "entropy": 0.7128468745946884,
236
+ "epoch": 0.9358288770053476,
237
+ "grad_norm": 0.7902767658233643,
238
+ "learning_rate": 1.2497797517355924e-05,
239
+ "loss": 0.6976683807373046,
240
+ "mean_token_accuracy": 0.8309504073858262,
241
+ "num_tokens": 7675590.0,
242
+ "step": 525
243
+ },
244
+ {
245
+ "entropy": 0.7067722028493881,
246
+ "epoch": 0.9803921568627451,
247
+ "grad_norm": 0.7943085432052612,
248
+ "learning_rate": 1.2214309890180613e-05,
249
+ "loss": 0.6949668884277344,
250
+ "mean_token_accuracy": 0.8305781084299088,
251
+ "num_tokens": 8042404.0,
252
+ "step": 550
253
+ },
254
+ {
255
+ "entropy": 0.695909548997879,
256
+ "epoch": 1.0249554367201426,
257
+ "grad_norm": 0.7510514259338379,
258
+ "learning_rate": 1.191921988609109e-05,
259
+ "loss": 0.6792121124267578,
260
+ "mean_token_accuracy": 0.8343433332443237,
261
+ "num_tokens": 8403933.0,
262
+ "step": 575
263
+ },
264
+ {
265
+ "entropy": 0.6738390463590622,
266
+ "epoch": 1.0695187165775402,
267
+ "grad_norm": 0.8021165132522583,
268
+ "learning_rate": 1.1613253750398085e-05,
269
+ "loss": 0.6603101348876953,
270
+ "mean_token_accuracy": 0.8382544696331025,
271
+ "num_tokens": 8772072.0,
272
+ "step": 600
273
+ },
274
+ {
275
+ "epoch": 1.0695187165775402,
276
+ "eval_entropy": 0.6920017371177674,
277
+ "eval_loss": 0.6961521506309509,
278
+ "eval_mean_token_accuracy": 0.8314581851959228,
279
+ "eval_num_tokens": 8772072.0,
280
+ "eval_runtime": 74.8097,
281
+ "eval_samples_per_second": 13.327,
282
+ "eval_steps_per_second": 3.342,
283
+ "step": 600
284
+ },
285
+ {
286
+ "entropy": 0.6920944279432297,
287
+ "epoch": 1.1140819964349375,
288
+ "grad_norm": 0.8023701310157776,
289
+ "learning_rate": 1.1297164495634069e-05,
290
+ "loss": 0.6772218322753907,
291
+ "mean_token_accuracy": 0.8343758553266525,
292
+ "num_tokens": 9137160.0,
293
+ "step": 625
294
+ },
295
+ {
296
+ "entropy": 0.67285136282444,
297
+ "epoch": 1.1586452762923352,
298
+ "grad_norm": 0.7788256406784058,
299
+ "learning_rate": 1.0971730048315917e-05,
300
+ "loss": 0.6581203460693359,
301
+ "mean_token_accuracy": 0.8390156370401383,
302
+ "num_tokens": 9505580.0,
303
+ "step": 650
304
+ },
305
+ {
306
+ "entropy": 0.6888180702924729,
307
+ "epoch": 1.2032085561497325,
308
+ "grad_norm": 0.8268939256668091,
309
+ "learning_rate": 1.0637751334391775e-05,
310
+ "loss": 0.673553466796875,
311
+ "mean_token_accuracy": 0.8359775388240814,
312
+ "num_tokens": 9868570.0,
313
+ "step": 675
314
+ },
315
+ {
316
+ "entropy": 0.6915264892578125,
317
+ "epoch": 1.2477718360071302,
318
+ "grad_norm": 0.8361654877662659,
319
+ "learning_rate": 1.0296050308084114e-05,
320
+ "loss": 0.6790201568603516,
321
+ "mean_token_accuracy": 0.8342142343521118,
322
+ "num_tokens": 10229373.0,
323
+ "step": 700
324
+ },
325
+ {
326
+ "entropy": 0.6885707491636276,
327
+ "epoch": 1.2923351158645278,
328
+ "grad_norm": 0.7386716604232788,
329
+ "learning_rate": 9.94746792898014e-06,
330
+ "loss": 0.6720596313476562,
331
+ "mean_token_accuracy": 0.8353542894124985,
332
+ "num_tokens": 10595419.0,
333
+ "step": 725
334
+ },
335
+ {
336
+ "entropy": 0.6660267195105553,
337
+ "epoch": 1.3368983957219251,
338
+ "grad_norm": 0.7973800897598267,
339
+ "learning_rate": 9.59286209234813e-06,
340
+ "loss": 0.6550118255615235,
341
+ "mean_token_accuracy": 0.8386269718408584,
342
+ "num_tokens": 10960517.0,
343
+ "step": 750
344
+ },
345
+ {
346
+ "entropy": 0.6469692060351372,
347
+ "epoch": 1.3814616755793225,
348
+ "grad_norm": 0.798152506351471,
349
+ "learning_rate": 9.233105517773445e-06,
350
+ "loss": 0.6308420181274415,
351
+ "mean_token_accuracy": 0.8429271316528321,
352
+ "num_tokens": 11328702.0,
353
+ "step": 775
354
+ },
355
+ {
356
+ "entropy": 0.6708013540506363,
357
+ "epoch": 1.4260249554367201,
358
+ "grad_norm": 0.9537823796272278,
359
+ "learning_rate": 8.869083601310398e-06,
360
+ "loss": 0.6537622833251953,
361
+ "mean_token_accuracy": 0.838316883444786,
362
+ "num_tokens": 11697546.0,
363
+ "step": 800
364
+ },
365
+ {
366
+ "epoch": 1.4260249554367201,
367
+ "eval_entropy": 0.670824561715126,
368
+ "eval_loss": 0.6723578572273254,
369
+ "eval_mean_token_accuracy": 0.8353032109737396,
370
+ "eval_num_tokens": 11697546.0,
371
+ "eval_runtime": 74.7664,
372
+ "eval_samples_per_second": 13.335,
373
+ "eval_steps_per_second": 3.344,
374
+ "step": 800
375
+ },
376
+ {
377
+ "entropy": 0.659270493388176,
378
+ "epoch": 1.4705882352941178,
379
+ "grad_norm": 0.846034586429596,
380
+ "learning_rate": 8.501692236436132e-06,
381
+ "loss": 0.6444293212890625,
382
+ "mean_token_accuracy": 0.8404667204618455,
383
+ "num_tokens": 12061827.0,
384
+ "step": 825
385
+ },
386
+ {
387
+ "entropy": 0.6627422112226486,
388
+ "epoch": 1.5151515151515151,
389
+ "grad_norm": 0.9181033968925476,
390
+ "learning_rate": 8.131835609169295e-06,
391
+ "loss": 0.6494012451171876,
392
+ "mean_token_accuracy": 0.839583694934845,
393
+ "num_tokens": 12427853.0,
394
+ "step": 850
395
+ },
396
+ {
397
+ "entropy": 0.6641036707162857,
398
+ "epoch": 1.5597147950089125,
399
+ "grad_norm": 0.858001172542572,
400
+ "learning_rate": 7.760423972779985e-06,
401
+ "loss": 0.6495742797851562,
402
+ "mean_token_accuracy": 0.8395592844486237,
403
+ "num_tokens": 12799973.0,
404
+ "step": 875
405
+ },
406
+ {
407
+ "entropy": 0.6689085793495179,
408
+ "epoch": 1.6042780748663101,
409
+ "grad_norm": 0.8615349531173706,
410
+ "learning_rate": 7.388371407567565e-06,
411
+ "loss": 0.6532559967041016,
412
+ "mean_token_accuracy": 0.8388407498598098,
413
+ "num_tokens": 13166796.0,
414
+ "step": 900
415
+ },
416
+ {
417
+ "entropy": 0.6729245400428772,
418
+ "epoch": 1.6488413547237077,
419
+ "grad_norm": 0.831142783164978,
420
+ "learning_rate": 7.01659357121981e-06,
421
+ "loss": 0.6572090911865235,
422
+ "mean_token_accuracy": 0.8372052818536758,
423
+ "num_tokens": 13532499.0,
424
+ "step": 925
425
+ },
426
+ {
427
+ "entropy": 0.6538485777378082,
428
+ "epoch": 1.6934046345811051,
429
+ "grad_norm": 0.919346809387207,
430
+ "learning_rate": 6.6460054452899315e-06,
431
+ "loss": 0.6404708862304688,
432
+ "mean_token_accuracy": 0.8411308795213699,
433
+ "num_tokens": 13898404.0,
434
+ "step": 950
435
+ },
436
+ {
437
+ "entropy": 0.6691750481724739,
438
+ "epoch": 1.7379679144385025,
439
+ "grad_norm": 0.9280221462249756,
440
+ "learning_rate": 6.277519083337656e-06,
441
+ "loss": 0.6546466827392579,
442
+ "mean_token_accuracy": 0.838825848698616,
443
+ "num_tokens": 14261658.0,
444
+ "step": 975
445
+ },
446
+ {
447
+ "entropy": 0.6536609560251236,
448
+ "epoch": 1.7825311942959001,
449
+ "grad_norm": 0.9000495076179504,
450
+ "learning_rate": 5.9120413662763545e-06,
451
+ "loss": 0.6405950927734375,
452
+ "mean_token_accuracy": 0.8412596487998962,
453
+ "num_tokens": 14625008.0,
454
+ "step": 1000
455
+ },
456
+ {
457
+ "epoch": 1.7825311942959001,
458
+ "eval_entropy": 0.6716028243303299,
459
+ "eval_loss": 0.6561057567596436,
460
+ "eval_mean_token_accuracy": 0.8381222817897797,
461
+ "eval_num_tokens": 14625008.0,
462
+ "eval_runtime": 74.7617,
463
+ "eval_samples_per_second": 13.336,
464
+ "eval_steps_per_second": 3.344,
465
+ "step": 1000
466
+ },
467
+ {
468
+ "entropy": 0.6671841683983802,
469
+ "epoch": 1.8270944741532977,
470
+ "grad_norm": 0.8711400628089905,
471
+ "learning_rate": 5.550471770450572e-06,
472
+ "loss": 0.6500684356689453,
473
+ "mean_token_accuracy": 0.8389109486341476,
474
+ "num_tokens": 14985559.0,
475
+ "step": 1025
476
+ },
477
+ {
478
+ "entropy": 0.6568678751587868,
479
+ "epoch": 1.8716577540106951,
480
+ "grad_norm": 0.9135516285896301,
481
+ "learning_rate": 5.193700153936934e-06,
482
+ "loss": 0.6418634033203126,
483
+ "mean_token_accuracy": 0.8414819967746735,
484
+ "num_tokens": 15354311.0,
485
+ "step": 1050
486
+ },
487
+ {
488
+ "entropy": 0.6430006143450737,
489
+ "epoch": 1.9162210338680927,
490
+ "grad_norm": 0.9346958994865417,
491
+ "learning_rate": 4.842604566516537e-06,
492
+ "loss": 0.6278348541259766,
493
+ "mean_token_accuracy": 0.8434987276792526,
494
+ "num_tokens": 15721382.0,
495
+ "step": 1075
496
+ },
497
+ {
498
+ "entropy": 0.6387567144632339,
499
+ "epoch": 1.9607843137254903,
500
+ "grad_norm": 0.9693854451179504,
501
+ "learning_rate": 4.498049088708706e-06,
502
+ "loss": 0.6229427337646485,
503
+ "mean_token_accuracy": 0.8442350590229034,
504
+ "num_tokens": 16088038.0,
505
+ "step": 1100
506
+ },
507
+ {
508
+ "entropy": 0.6434592244029045,
509
+ "epoch": 2.0053475935828877,
510
+ "grad_norm": 0.9158383011817932,
511
+ "learning_rate": 4.160881705184478e-06,
512
+ "loss": 0.6287346649169921,
513
+ "mean_token_accuracy": 0.8434397971630097,
514
+ "num_tokens": 16448228.0,
515
+ "step": 1125
516
+ },
517
+ {
518
+ "entropy": 0.6293540370464324,
519
+ "epoch": 2.049910873440285,
520
+ "grad_norm": 0.9278510808944702,
521
+ "learning_rate": 3.831932217793526e-06,
522
+ "loss": 0.6089762115478515,
523
+ "mean_token_accuracy": 0.8473779886960984,
524
+ "num_tokens": 16812866.0,
525
+ "step": 1150
526
+ },
527
+ {
528
+ "entropy": 0.6246551343798638,
529
+ "epoch": 2.0944741532976825,
530
+ "grad_norm": 0.8729245066642761,
531
+ "learning_rate": 3.5120102033408053e-06,
532
+ "loss": 0.6066710281372071,
533
+ "mean_token_accuracy": 0.8471958756446838,
534
+ "num_tokens": 17177909.0,
535
+ "step": 1175
536
+ },
537
+ {
538
+ "entropy": 0.6269071605801583,
539
+ "epoch": 2.1390374331550803,
540
+ "grad_norm": 0.8709802031517029,
541
+ "learning_rate": 3.201903021138983e-06,
542
+ "loss": 0.6111587905883789,
543
+ "mean_token_accuracy": 0.8464664667844772,
544
+ "num_tokens": 17544377.0,
545
+ "step": 1200
546
+ },
547
+ {
548
+ "epoch": 2.1390374331550803,
549
+ "eval_entropy": 0.6344557646512985,
550
+ "eval_loss": 0.6462315320968628,
551
+ "eval_mean_token_accuracy": 0.8403205525875092,
552
+ "eval_num_tokens": 17544377.0,
553
+ "eval_runtime": 74.8344,
554
+ "eval_samples_per_second": 13.323,
555
+ "eval_steps_per_second": 3.341,
556
+ "step": 1200
557
+ },
558
+ {
559
+ "entropy": 0.617467094361782,
560
+ "epoch": 2.1836007130124777,
561
+ "grad_norm": 0.8771170973777771,
562
+ "learning_rate": 2.9023738752403013e-06,
563
+ "loss": 0.5986224746704102,
564
+ "mean_token_accuracy": 0.849560460448265,
565
+ "num_tokens": 17912855.0,
566
+ "step": 1225
567
+ },
568
+ {
569
+ "entropy": 0.6177873882651329,
570
+ "epoch": 2.228163992869875,
571
+ "grad_norm": 1.0253841876983643,
572
+ "learning_rate": 2.614159936116893e-06,
573
+ "loss": 0.5998103332519531,
574
+ "mean_token_accuracy": 0.8487882578372955,
575
+ "num_tokens": 18279476.0,
576
+ "step": 1250
577
+ },
578
+ {
579
+ "entropy": 0.6312283331155777,
580
+ "epoch": 2.2727272727272725,
581
+ "grad_norm": 0.9465038180351257,
582
+ "learning_rate": 2.337970526412267e-06,
583
+ "loss": 0.6118741226196289,
584
+ "mean_token_accuracy": 0.8458875006437302,
585
+ "num_tokens": 18644269.0,
586
+ "step": 1275
587
+ },
588
+ {
589
+ "entropy": 0.6209010258316994,
590
+ "epoch": 2.3172905525846703,
591
+ "grad_norm": 0.9807332158088684,
592
+ "learning_rate": 2.074485375229037e-06,
593
+ "loss": 0.6052029037475586,
594
+ "mean_token_accuracy": 0.8471564346551895,
595
+ "num_tokens": 19009107.0,
596
+ "step": 1300
597
+ },
598
+ {
599
+ "entropy": 0.6401337105035781,
600
+ "epoch": 2.3618538324420677,
601
+ "grad_norm": 1.0486506223678589,
602
+ "learning_rate": 1.82435294524924e-06,
603
+ "loss": 0.6207434463500977,
604
+ "mean_token_accuracy": 0.8439285135269166,
605
+ "num_tokens": 19374349.0,
606
+ "step": 1325
607
+ },
608
+ {
609
+ "entropy": 0.6109014016389847,
610
+ "epoch": 2.406417112299465,
611
+ "grad_norm": 0.9694714546203613,
612
+ "learning_rate": 1.5881888368043559e-06,
613
+ "loss": 0.5924215316772461,
614
+ "mean_token_accuracy": 0.8494464015960693,
615
+ "num_tokens": 19743047.0,
616
+ "step": 1350
617
+ },
618
+ {
619
+ "entropy": 0.6300237196683883,
620
+ "epoch": 2.450980392156863,
621
+ "grad_norm": 0.9961308836936951,
622
+ "learning_rate": 1.3665742728227932e-06,
623
+ "loss": 0.6133406066894531,
624
+ "mean_token_accuracy": 0.8462675029039383,
625
+ "num_tokens": 20105853.0,
626
+ "step": 1375
627
+ },
628
+ {
629
+ "entropy": 0.6148158556222916,
630
+ "epoch": 2.4955436720142603,
631
+ "grad_norm": 1.0224037170410156,
632
+ "learning_rate": 1.1600546683835065e-06,
633
+ "loss": 0.5978146362304687,
634
+ "mean_token_accuracy": 0.8488863033056259,
635
+ "num_tokens": 20469876.0,
636
+ "step": 1400
637
+ },
638
+ {
639
+ "epoch": 2.4955436720142603,
640
+ "eval_entropy": 0.627735008597374,
641
+ "eval_loss": 0.6408645510673523,
642
+ "eval_mean_token_accuracy": 0.8411034562587738,
643
+ "eval_num_tokens": 20469876.0,
644
+ "eval_runtime": 74.7955,
645
+ "eval_samples_per_second": 13.33,
646
+ "eval_steps_per_second": 3.342,
647
+ "step": 1400
648
+ },
649
+ {
650
+ "entropy": 0.6221208718419075,
651
+ "epoch": 2.5401069518716577,
652
+ "grad_norm": 1.0483691692352295,
653
+ "learning_rate": 9.691382883962515e-07,
654
+ "loss": 0.6043234634399414,
655
+ "mean_token_accuracy": 0.8475923782587051,
656
+ "num_tokens": 20834908.0,
657
+ "step": 1425
658
+ },
659
+ {
660
+ "entropy": 0.6163172733783722,
661
+ "epoch": 2.5846702317290555,
662
+ "grad_norm": 1.0169743299484253,
663
+ "learning_rate": 7.942949967120098e-07,
664
+ "loss": 0.6007443237304687,
665
+ "mean_token_accuracy": 0.8487154805660247,
666
+ "num_tokens": 21199575.0,
667
+ "step": 1450
668
+ },
669
+ {
670
+ "entropy": 0.6306376928091049,
671
+ "epoch": 2.629233511586453,
672
+ "grad_norm": 0.9749926328659058,
673
+ "learning_rate": 6.359550997421698e-07,
674
+ "loss": 0.6101107406616211,
675
+ "mean_token_accuracy": 0.8469714081287384,
676
+ "num_tokens": 21564414.0,
677
+ "step": 1475
678
+ },
679
+ {
680
+ "entropy": 0.6135326558351517,
681
+ "epoch": 2.6737967914438503,
682
+ "grad_norm": 1.0116835832595825,
683
+ "learning_rate": 4.945082874324541e-07,
684
+ "loss": 0.5956003189086914,
685
+ "mean_token_accuracy": 0.8500852519273758,
686
+ "num_tokens": 21928080.0,
687
+ "step": 1500
688
+ },
689
+ {
690
+ "entropy": 0.6165187922120094,
691
+ "epoch": 2.7183600713012477,
692
+ "grad_norm": 0.9928510785102844,
693
+ "learning_rate": 3.7030267419789764e-07,
694
+ "loss": 0.6013864135742187,
695
+ "mean_token_accuracy": 0.8494158619642258,
696
+ "num_tokens": 22296207.0,
697
+ "step": 1525
698
+ },
699
+ {
700
+ "entropy": 0.619775217473507,
701
+ "epoch": 2.762923351158645,
702
+ "grad_norm": 0.9901552796363831,
703
+ "learning_rate": 2.6364394217929856e-07,
704
+ "loss": 0.6034153366088867,
705
+ "mean_token_accuracy": 0.8480428576469421,
706
+ "num_tokens": 22661645.0,
707
+ "step": 1550
708
+ },
709
+ {
710
+ "entropy": 0.6086369237303734,
711
+ "epoch": 2.807486631016043,
712
+ "grad_norm": 0.8772838711738586,
713
+ "learning_rate": 1.7479458892961846e-07,
714
+ "loss": 0.5885520553588868,
715
+ "mean_token_accuracy": 0.8515380412340164,
716
+ "num_tokens": 23028234.0,
717
+ "step": 1575
718
+ },
719
+ {
720
+ "entropy": 0.6151553666591645,
721
+ "epoch": 2.8520499108734403,
722
+ "grad_norm": 0.9948622584342957,
723
+ "learning_rate": 1.0397328138187557e-07,
724
+ "loss": 0.5963270568847656,
725
+ "mean_token_accuracy": 0.8506602907180786,
726
+ "num_tokens": 23393799.0,
727
+ "step": 1600
728
+ },
729
+ {
730
+ "epoch": 2.8520499108734403,
731
+ "eval_entropy": 0.626643338561058,
732
+ "eval_loss": 0.6390902400016785,
733
+ "eval_mean_token_accuracy": 0.8414715526103973,
734
+ "eval_num_tokens": 23393799.0,
735
+ "eval_runtime": 74.7969,
736
+ "eval_samples_per_second": 13.329,
737
+ "eval_steps_per_second": 3.342,
738
+ "step": 1600
739
+ }
740
+ ],
741
+ "logging_steps": 25,
742
+ "max_steps": 1683,
743
+ "num_input_tokens_seen": 0,
744
+ "num_train_epochs": 3,
745
+ "save_steps": 200,
746
+ "stateful_callbacks": {
747
+ "TrainerControl": {
748
+ "args": {
749
+ "should_epoch_stop": false,
750
+ "should_evaluate": false,
751
+ "should_log": false,
752
+ "should_save": true,
753
+ "should_training_stop": false
754
+ },
755
+ "attributes": {}
756
+ }
757
+ },
758
+ "total_flos": 1.1278684184859034e+18,
759
+ "train_batch_size": 4,
760
+ "trial_name": null,
761
+ "trial_params": null
762
+ }
checkpoint-1600/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33ef6c419df9522b8fc114339f82c3e390d52dabd314191546cdbca2a738c0af
3
+ size 5585
checkpoint-1683/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
checkpoint-1683/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 128,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "k_proj",
34
+ "gate_proj",
35
+ "down_proj",
36
+ "up_proj",
37
+ "q_proj",
38
+ "o_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-1683/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be15156e4206b36e88697f973f0757a6eb2e18abadf49ac66348796353b26c7c
3
+ size 645975704
checkpoint-1683/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-1683/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e6d833bf1cb9a5f8f663f12115ab5eb20f4a51a30008dd5ca9f77cbaf44b23b
3
+ size 1292182139
checkpoint-1683/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19a191961b95a03b05655966beedb7977207e4e2a61b0fb5a169be43daad40f4
3
+ size 14645
checkpoint-1683/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85d2044299f996170d57bec12325224a16707d54a6bba50a223724ae1ebb0267
3
+ size 1465
checkpoint-1683/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
checkpoint-1683/tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "padding_side": "right",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "Qwen2Tokenizer",
29
+ "unk_token": null
30
+ }
checkpoint-1683/trainer_state.json ADDED
@@ -0,0 +1,792 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 200,
7
+ "global_step": 1683,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.1273316520452499,
14
+ "epoch": 0.044563279857397504,
15
+ "grad_norm": 2.0025203227996826,
16
+ "learning_rate": 3.6e-06,
17
+ "loss": 2.2611521911621093,
18
+ "mean_token_accuracy": 0.6291543507575988,
19
+ "num_tokens": 363359.0,
20
+ "step": 25
21
+ },
22
+ {
23
+ "entropy": 1.391269074678421,
24
+ "epoch": 0.08912655971479501,
25
+ "grad_norm": 1.0109221935272217,
26
+ "learning_rate": 7.35e-06,
27
+ "loss": 1.767060546875,
28
+ "mean_token_accuracy": 0.6594330656528473,
29
+ "num_tokens": 724311.0,
30
+ "step": 50
31
+ },
32
+ {
33
+ "entropy": 1.2697400665283203,
34
+ "epoch": 0.13368983957219252,
35
+ "grad_norm": 0.4787505269050598,
36
+ "learning_rate": 1.11e-05,
37
+ "loss": 1.231834716796875,
38
+ "mean_token_accuracy": 0.7449960750341416,
39
+ "num_tokens": 1089859.0,
40
+ "step": 75
41
+ },
42
+ {
43
+ "entropy": 1.0446020710468291,
44
+ "epoch": 0.17825311942959002,
45
+ "grad_norm": 0.39569053053855896,
46
+ "learning_rate": 1.485e-05,
47
+ "loss": 1.0368045043945313,
48
+ "mean_token_accuracy": 0.7771743559837341,
49
+ "num_tokens": 1456575.0,
50
+ "step": 100
51
+ },
52
+ {
53
+ "entropy": 0.9694650781154632,
54
+ "epoch": 0.22281639928698752,
55
+ "grad_norm": 0.44794389605522156,
56
+ "learning_rate": 1.4991494309781894e-05,
57
+ "loss": 0.9510629272460938,
58
+ "mean_token_accuracy": 0.7904590421915054,
59
+ "num_tokens": 1819729.0,
60
+ "step": 125
61
+ },
62
+ {
63
+ "entropy": 0.9067935299873352,
64
+ "epoch": 0.26737967914438504,
65
+ "grad_norm": 0.49256861209869385,
66
+ "learning_rate": 1.4964566090257208e-05,
67
+ "loss": 0.8909156036376953,
68
+ "mean_token_accuracy": 0.8001276826858521,
69
+ "num_tokens": 2185895.0,
70
+ "step": 150
71
+ },
72
+ {
73
+ "entropy": 0.8762328952550889,
74
+ "epoch": 0.31194295900178254,
75
+ "grad_norm": 0.48932692408561707,
76
+ "learning_rate": 1.4919266844792835e-05,
77
+ "loss": 0.8628057098388672,
78
+ "mean_token_accuracy": 0.8043822544813156,
79
+ "num_tokens": 2554889.0,
80
+ "step": 175
81
+ },
82
+ {
83
+ "entropy": 0.8572803306579589,
84
+ "epoch": 0.35650623885918004,
85
+ "grad_norm": 0.5422897338867188,
86
+ "learning_rate": 1.485570805925459e-05,
87
+ "loss": 0.8397312927246093,
88
+ "mean_token_accuracy": 0.8085139858722686,
89
+ "num_tokens": 2920719.0,
90
+ "step": 200
91
+ },
92
+ {
93
+ "epoch": 0.35650623885918004,
94
+ "eval_entropy": 0.8323967654705048,
95
+ "eval_loss": 0.8291334509849548,
96
+ "eval_mean_token_accuracy": 0.8096436858177185,
97
+ "eval_num_tokens": 2920719.0,
98
+ "eval_runtime": 74.8901,
99
+ "eval_samples_per_second": 13.313,
100
+ "eval_steps_per_second": 3.338,
101
+ "step": 200
102
+ },
103
+ {
104
+ "entropy": 0.8403333276510239,
105
+ "epoch": 0.40106951871657753,
106
+ "grad_norm": 0.5801687240600586,
107
+ "learning_rate": 1.4774046158019147e-05,
108
+ "loss": 0.8235167694091797,
109
+ "mean_token_accuracy": 0.8098820477724076,
110
+ "num_tokens": 3288435.0,
111
+ "step": 225
112
+ },
113
+ {
114
+ "entropy": 0.817181087732315,
115
+ "epoch": 0.44563279857397503,
116
+ "grad_norm": 0.603880763053894,
117
+ "learning_rate": 1.467448211899838e-05,
118
+ "loss": 0.799793701171875,
119
+ "mean_token_accuracy": 0.8144050502777099,
120
+ "num_tokens": 3654202.0,
121
+ "step": 250
122
+ },
123
+ {
124
+ "entropy": 0.8009092861413956,
125
+ "epoch": 0.49019607843137253,
126
+ "grad_norm": 0.5765889883041382,
127
+ "learning_rate": 1.4557260979013106e-05,
128
+ "loss": 0.7848175811767578,
129
+ "mean_token_accuracy": 0.8172187548875809,
130
+ "num_tokens": 4016287.0,
131
+ "step": 275
132
+ },
133
+ {
134
+ "entropy": 0.8024452942609787,
135
+ "epoch": 0.5347593582887701,
136
+ "grad_norm": 0.7014256715774536,
137
+ "learning_rate": 1.4422671230733536e-05,
138
+ "loss": 0.7894332122802734,
139
+ "mean_token_accuracy": 0.8166103160381317,
140
+ "num_tokens": 4379803.0,
141
+ "step": 300
142
+ },
143
+ {
144
+ "entropy": 0.7904212397336959,
145
+ "epoch": 0.5793226381461676,
146
+ "grad_norm": 0.6543148756027222,
147
+ "learning_rate": 1.4271044112670647e-05,
148
+ "loss": 0.7738318634033203,
149
+ "mean_token_accuracy": 0.8181957858800888,
150
+ "num_tokens": 4748127.0,
151
+ "step": 325
152
+ },
153
+ {
154
+ "entropy": 0.7665162217617035,
155
+ "epoch": 0.6238859180035651,
156
+ "grad_norm": 0.7135517001152039,
157
+ "learning_rate": 1.410275279396588e-05,
158
+ "loss": 0.7533625030517578,
159
+ "mean_token_accuracy": 0.8217650431394578,
160
+ "num_tokens": 5113040.0,
161
+ "step": 350
162
+ },
163
+ {
164
+ "entropy": 0.7557546135783195,
165
+ "epoch": 0.6684491978609626,
166
+ "grad_norm": 0.6762207746505737,
167
+ "learning_rate": 1.3918211455985435e-05,
168
+ "loss": 0.7417732238769531,
169
+ "mean_token_accuracy": 0.8234544372558594,
170
+ "num_tokens": 5477938.0,
171
+ "step": 375
172
+ },
173
+ {
174
+ "entropy": 0.7480651473999024,
175
+ "epoch": 0.7130124777183601,
176
+ "grad_norm": 0.6766519546508789,
177
+ "learning_rate": 1.3717874272979488e-05,
178
+ "loss": 0.7343754577636719,
179
+ "mean_token_accuracy": 0.8247038215398789,
180
+ "num_tokens": 5846777.0,
181
+ "step": 400
182
+ },
183
+ {
184
+ "epoch": 0.7130124777183601,
185
+ "eval_entropy": 0.7494170541763305,
186
+ "eval_loss": 0.7359814047813416,
187
+ "eval_mean_token_accuracy": 0.8247466235160827,
188
+ "eval_num_tokens": 5846777.0,
189
+ "eval_runtime": 74.7841,
190
+ "eval_samples_per_second": 13.332,
191
+ "eval_steps_per_second": 3.343,
192
+ "step": 400
193
+ },
194
+ {
195
+ "entropy": 0.7388822847604751,
196
+ "epoch": 0.7575757575757576,
197
+ "grad_norm": 0.7576785683631897,
198
+ "learning_rate": 1.350223429431504e-05,
199
+ "loss": 0.7303533935546875,
200
+ "mean_token_accuracy": 0.82606416285038,
201
+ "num_tokens": 6212618.0,
202
+ "step": 425
203
+ },
204
+ {
205
+ "entropy": 0.7430421102046967,
206
+ "epoch": 0.8021390374331551,
207
+ "grad_norm": 0.7369253635406494,
208
+ "learning_rate": 1.3271822231033263e-05,
209
+ "loss": 0.7292195129394531,
210
+ "mean_token_accuracy": 0.8252584689855575,
211
+ "num_tokens": 6578764.0,
212
+ "step": 450
213
+ },
214
+ {
215
+ "entropy": 0.7350365900993348,
216
+ "epoch": 0.8467023172905526,
217
+ "grad_norm": 0.7027698755264282,
218
+ "learning_rate": 1.3027205149717825e-05,
219
+ "loss": 0.7203064727783203,
220
+ "mean_token_accuracy": 0.8271685636043549,
221
+ "num_tokens": 6940517.0,
222
+ "step": 475
223
+ },
224
+ {
225
+ "entropy": 0.7169802790880203,
226
+ "epoch": 0.8912655971479501,
227
+ "grad_norm": 0.7340224981307983,
228
+ "learning_rate": 1.276898507688866e-05,
229
+ "loss": 0.705379867553711,
230
+ "mean_token_accuracy": 0.8299148625135422,
231
+ "num_tokens": 7306466.0,
232
+ "step": 500
233
+ },
234
+ {
235
+ "entropy": 0.7128468745946884,
236
+ "epoch": 0.9358288770053476,
237
+ "grad_norm": 0.7902767658233643,
238
+ "learning_rate": 1.2497797517355924e-05,
239
+ "loss": 0.6976683807373046,
240
+ "mean_token_accuracy": 0.8309504073858262,
241
+ "num_tokens": 7675590.0,
242
+ "step": 525
243
+ },
244
+ {
245
+ "entropy": 0.7067722028493881,
246
+ "epoch": 0.9803921568627451,
247
+ "grad_norm": 0.7943085432052612,
248
+ "learning_rate": 1.2214309890180613e-05,
249
+ "loss": 0.6949668884277344,
250
+ "mean_token_accuracy": 0.8305781084299088,
251
+ "num_tokens": 8042404.0,
252
+ "step": 550
253
+ },
254
+ {
255
+ "entropy": 0.695909548997879,
256
+ "epoch": 1.0249554367201426,
257
+ "grad_norm": 0.7510514259338379,
258
+ "learning_rate": 1.191921988609109e-05,
259
+ "loss": 0.6792121124267578,
260
+ "mean_token_accuracy": 0.8343433332443237,
261
+ "num_tokens": 8403933.0,
262
+ "step": 575
263
+ },
264
+ {
265
+ "entropy": 0.6738390463590622,
266
+ "epoch": 1.0695187165775402,
267
+ "grad_norm": 0.8021165132522583,
268
+ "learning_rate": 1.1613253750398085e-05,
269
+ "loss": 0.6603101348876953,
270
+ "mean_token_accuracy": 0.8382544696331025,
271
+ "num_tokens": 8772072.0,
272
+ "step": 600
273
+ },
274
+ {
275
+ "epoch": 1.0695187165775402,
276
+ "eval_entropy": 0.6920017371177674,
277
+ "eval_loss": 0.6961521506309509,
278
+ "eval_mean_token_accuracy": 0.8314581851959228,
279
+ "eval_num_tokens": 8772072.0,
280
+ "eval_runtime": 74.8097,
281
+ "eval_samples_per_second": 13.327,
282
+ "eval_steps_per_second": 3.342,
283
+ "step": 600
284
+ },
285
+ {
286
+ "entropy": 0.6920944279432297,
287
+ "epoch": 1.1140819964349375,
288
+ "grad_norm": 0.8023701310157776,
289
+ "learning_rate": 1.1297164495634069e-05,
290
+ "loss": 0.6772218322753907,
291
+ "mean_token_accuracy": 0.8343758553266525,
292
+ "num_tokens": 9137160.0,
293
+ "step": 625
294
+ },
295
+ {
296
+ "entropy": 0.67285136282444,
297
+ "epoch": 1.1586452762923352,
298
+ "grad_norm": 0.7788256406784058,
299
+ "learning_rate": 1.0971730048315917e-05,
300
+ "loss": 0.6581203460693359,
301
+ "mean_token_accuracy": 0.8390156370401383,
302
+ "num_tokens": 9505580.0,
303
+ "step": 650
304
+ },
305
+ {
306
+ "entropy": 0.6888180702924729,
307
+ "epoch": 1.2032085561497325,
308
+ "grad_norm": 0.8268939256668091,
309
+ "learning_rate": 1.0637751334391775e-05,
310
+ "loss": 0.673553466796875,
311
+ "mean_token_accuracy": 0.8359775388240814,
312
+ "num_tokens": 9868570.0,
313
+ "step": 675
314
+ },
315
+ {
316
+ "entropy": 0.6915264892578125,
317
+ "epoch": 1.2477718360071302,
318
+ "grad_norm": 0.8361654877662659,
319
+ "learning_rate": 1.0296050308084114e-05,
320
+ "loss": 0.6790201568603516,
321
+ "mean_token_accuracy": 0.8342142343521118,
322
+ "num_tokens": 10229373.0,
323
+ "step": 700
324
+ },
325
+ {
326
+ "entropy": 0.6885707491636276,
327
+ "epoch": 1.2923351158645278,
328
+ "grad_norm": 0.7386716604232788,
329
+ "learning_rate": 9.94746792898014e-06,
330
+ "loss": 0.6720596313476562,
331
+ "mean_token_accuracy": 0.8353542894124985,
332
+ "num_tokens": 10595419.0,
333
+ "step": 725
334
+ },
335
+ {
336
+ "entropy": 0.6660267195105553,
337
+ "epoch": 1.3368983957219251,
338
+ "grad_norm": 0.7973800897598267,
339
+ "learning_rate": 9.59286209234813e-06,
340
+ "loss": 0.6550118255615235,
341
+ "mean_token_accuracy": 0.8386269718408584,
342
+ "num_tokens": 10960517.0,
343
+ "step": 750
344
+ },
345
+ {
346
+ "entropy": 0.6469692060351372,
347
+ "epoch": 1.3814616755793225,
348
+ "grad_norm": 0.798152506351471,
349
+ "learning_rate": 9.233105517773445e-06,
350
+ "loss": 0.6308420181274415,
351
+ "mean_token_accuracy": 0.8429271316528321,
352
+ "num_tokens": 11328702.0,
353
+ "step": 775
354
+ },
355
+ {
356
+ "entropy": 0.6708013540506363,
357
+ "epoch": 1.4260249554367201,
358
+ "grad_norm": 0.9537823796272278,
359
+ "learning_rate": 8.869083601310398e-06,
360
+ "loss": 0.6537622833251953,
361
+ "mean_token_accuracy": 0.838316883444786,
362
+ "num_tokens": 11697546.0,
363
+ "step": 800
364
+ },
365
+ {
366
+ "epoch": 1.4260249554367201,
367
+ "eval_entropy": 0.670824561715126,
368
+ "eval_loss": 0.6723578572273254,
369
+ "eval_mean_token_accuracy": 0.8353032109737396,
370
+ "eval_num_tokens": 11697546.0,
371
+ "eval_runtime": 74.7664,
372
+ "eval_samples_per_second": 13.335,
373
+ "eval_steps_per_second": 3.344,
374
+ "step": 800
375
+ },
376
+ {
377
+ "entropy": 0.659270493388176,
378
+ "epoch": 1.4705882352941178,
379
+ "grad_norm": 0.846034586429596,
380
+ "learning_rate": 8.501692236436132e-06,
381
+ "loss": 0.6444293212890625,
382
+ "mean_token_accuracy": 0.8404667204618455,
383
+ "num_tokens": 12061827.0,
384
+ "step": 825
385
+ },
386
+ {
387
+ "entropy": 0.6627422112226486,
388
+ "epoch": 1.5151515151515151,
389
+ "grad_norm": 0.9181033968925476,
390
+ "learning_rate": 8.131835609169295e-06,
391
+ "loss": 0.6494012451171876,
392
+ "mean_token_accuracy": 0.839583694934845,
393
+ "num_tokens": 12427853.0,
394
+ "step": 850
395
+ },
396
+ {
397
+ "entropy": 0.6641036707162857,
398
+ "epoch": 1.5597147950089125,
399
+ "grad_norm": 0.858001172542572,
400
+ "learning_rate": 7.760423972779985e-06,
401
+ "loss": 0.6495742797851562,
402
+ "mean_token_accuracy": 0.8395592844486237,
403
+ "num_tokens": 12799973.0,
404
+ "step": 875
405
+ },
406
+ {
407
+ "entropy": 0.6689085793495179,
408
+ "epoch": 1.6042780748663101,
409
+ "grad_norm": 0.8615349531173706,
410
+ "learning_rate": 7.388371407567565e-06,
411
+ "loss": 0.6532559967041016,
412
+ "mean_token_accuracy": 0.8388407498598098,
413
+ "num_tokens": 13166796.0,
414
+ "step": 900
415
+ },
416
+ {
417
+ "entropy": 0.6729245400428772,
418
+ "epoch": 1.6488413547237077,
419
+ "grad_norm": 0.831142783164978,
420
+ "learning_rate": 7.01659357121981e-06,
421
+ "loss": 0.6572090911865235,
422
+ "mean_token_accuracy": 0.8372052818536758,
423
+ "num_tokens": 13532499.0,
424
+ "step": 925
425
+ },
426
+ {
427
+ "entropy": 0.6538485777378082,
428
+ "epoch": 1.6934046345811051,
429
+ "grad_norm": 0.919346809387207,
430
+ "learning_rate": 6.6460054452899315e-06,
431
+ "loss": 0.6404708862304688,
432
+ "mean_token_accuracy": 0.8411308795213699,
433
+ "num_tokens": 13898404.0,
434
+ "step": 950
435
+ },
436
+ {
437
+ "entropy": 0.6691750481724739,
438
+ "epoch": 1.7379679144385025,
439
+ "grad_norm": 0.9280221462249756,
440
+ "learning_rate": 6.277519083337656e-06,
441
+ "loss": 0.6546466827392579,
442
+ "mean_token_accuracy": 0.838825848698616,
443
+ "num_tokens": 14261658.0,
444
+ "step": 975
445
+ },
446
+ {
447
+ "entropy": 0.6536609560251236,
448
+ "epoch": 1.7825311942959001,
449
+ "grad_norm": 0.9000495076179504,
450
+ "learning_rate": 5.9120413662763545e-06,
451
+ "loss": 0.6405950927734375,
452
+ "mean_token_accuracy": 0.8412596487998962,
453
+ "num_tokens": 14625008.0,
454
+ "step": 1000
455
+ },
456
+ {
457
+ "epoch": 1.7825311942959001,
458
+ "eval_entropy": 0.6716028243303299,
459
+ "eval_loss": 0.6561057567596436,
460
+ "eval_mean_token_accuracy": 0.8381222817897797,
461
+ "eval_num_tokens": 14625008.0,
462
+ "eval_runtime": 74.7617,
463
+ "eval_samples_per_second": 13.336,
464
+ "eval_steps_per_second": 3.344,
465
+ "step": 1000
466
+ },
467
+ {
468
+ "entropy": 0.6671841683983802,
469
+ "epoch": 1.8270944741532977,
470
+ "grad_norm": 0.8711400628089905,
471
+ "learning_rate": 5.550471770450572e-06,
472
+ "loss": 0.6500684356689453,
473
+ "mean_token_accuracy": 0.8389109486341476,
474
+ "num_tokens": 14985559.0,
475
+ "step": 1025
476
+ },
477
+ {
478
+ "entropy": 0.6568678751587868,
479
+ "epoch": 1.8716577540106951,
480
+ "grad_norm": 0.9135516285896301,
481
+ "learning_rate": 5.193700153936934e-06,
482
+ "loss": 0.6418634033203126,
483
+ "mean_token_accuracy": 0.8414819967746735,
484
+ "num_tokens": 15354311.0,
485
+ "step": 1050
486
+ },
487
+ {
488
+ "entropy": 0.6430006143450737,
489
+ "epoch": 1.9162210338680927,
490
+ "grad_norm": 0.9346958994865417,
491
+ "learning_rate": 4.842604566516537e-06,
492
+ "loss": 0.6278348541259766,
493
+ "mean_token_accuracy": 0.8434987276792526,
494
+ "num_tokens": 15721382.0,
495
+ "step": 1075
496
+ },
497
+ {
498
+ "entropy": 0.6387567144632339,
499
+ "epoch": 1.9607843137254903,
500
+ "grad_norm": 0.9693854451179504,
501
+ "learning_rate": 4.498049088708706e-06,
502
+ "loss": 0.6229427337646485,
503
+ "mean_token_accuracy": 0.8442350590229034,
504
+ "num_tokens": 16088038.0,
505
+ "step": 1100
506
+ },
507
+ {
508
+ "entropy": 0.6434592244029045,
509
+ "epoch": 2.0053475935828877,
510
+ "grad_norm": 0.9158383011817932,
511
+ "learning_rate": 4.160881705184478e-06,
512
+ "loss": 0.6287346649169921,
513
+ "mean_token_accuracy": 0.8434397971630097,
514
+ "num_tokens": 16448228.0,
515
+ "step": 1125
516
+ },
517
+ {
518
+ "entropy": 0.6293540370464324,
519
+ "epoch": 2.049910873440285,
520
+ "grad_norm": 0.9278510808944702,
521
+ "learning_rate": 3.831932217793526e-06,
522
+ "loss": 0.6089762115478515,
523
+ "mean_token_accuracy": 0.8473779886960984,
524
+ "num_tokens": 16812866.0,
525
+ "step": 1150
526
+ },
527
+ {
528
+ "entropy": 0.6246551343798638,
529
+ "epoch": 2.0944741532976825,
530
+ "grad_norm": 0.8729245066642761,
531
+ "learning_rate": 3.5120102033408053e-06,
532
+ "loss": 0.6066710281372071,
533
+ "mean_token_accuracy": 0.8471958756446838,
534
+ "num_tokens": 17177909.0,
535
+ "step": 1175
536
+ },
537
+ {
538
+ "entropy": 0.6269071605801583,
539
+ "epoch": 2.1390374331550803,
540
+ "grad_norm": 0.8709802031517029,
541
+ "learning_rate": 3.201903021138983e-06,
542
+ "loss": 0.6111587905883789,
543
+ "mean_token_accuracy": 0.8464664667844772,
544
+ "num_tokens": 17544377.0,
545
+ "step": 1200
546
+ },
547
+ {
548
+ "epoch": 2.1390374331550803,
549
+ "eval_entropy": 0.6344557646512985,
550
+ "eval_loss": 0.6462315320968628,
551
+ "eval_mean_token_accuracy": 0.8403205525875092,
552
+ "eval_num_tokens": 17544377.0,
553
+ "eval_runtime": 74.8344,
554
+ "eval_samples_per_second": 13.323,
555
+ "eval_steps_per_second": 3.341,
556
+ "step": 1200
557
+ },
558
+ {
559
+ "entropy": 0.617467094361782,
560
+ "epoch": 2.1836007130124777,
561
+ "grad_norm": 0.8771170973777771,
562
+ "learning_rate": 2.9023738752403013e-06,
563
+ "loss": 0.5986224746704102,
564
+ "mean_token_accuracy": 0.849560460448265,
565
+ "num_tokens": 17912855.0,
566
+ "step": 1225
567
+ },
568
+ {
569
+ "entropy": 0.6177873882651329,
570
+ "epoch": 2.228163992869875,
571
+ "grad_norm": 1.0253841876983643,
572
+ "learning_rate": 2.614159936116893e-06,
573
+ "loss": 0.5998103332519531,
574
+ "mean_token_accuracy": 0.8487882578372955,
575
+ "num_tokens": 18279476.0,
576
+ "step": 1250
577
+ },
578
+ {
579
+ "entropy": 0.6312283331155777,
580
+ "epoch": 2.2727272727272725,
581
+ "grad_norm": 0.9465038180351257,
582
+ "learning_rate": 2.337970526412267e-06,
583
+ "loss": 0.6118741226196289,
584
+ "mean_token_accuracy": 0.8458875006437302,
585
+ "num_tokens": 18644269.0,
586
+ "step": 1275
587
+ },
588
+ {
589
+ "entropy": 0.6209010258316994,
590
+ "epoch": 2.3172905525846703,
591
+ "grad_norm": 0.9807332158088684,
592
+ "learning_rate": 2.074485375229037e-06,
593
+ "loss": 0.6052029037475586,
594
+ "mean_token_accuracy": 0.8471564346551895,
595
+ "num_tokens": 19009107.0,
596
+ "step": 1300
597
+ },
598
+ {
599
+ "entropy": 0.6401337105035781,
600
+ "epoch": 2.3618538324420677,
601
+ "grad_norm": 1.0486506223678589,
602
+ "learning_rate": 1.82435294524924e-06,
603
+ "loss": 0.6207434463500977,
604
+ "mean_token_accuracy": 0.8439285135269166,
605
+ "num_tokens": 19374349.0,
606
+ "step": 1325
607
+ },
608
+ {
609
+ "entropy": 0.6109014016389847,
610
+ "epoch": 2.406417112299465,
611
+ "grad_norm": 0.9694714546203613,
612
+ "learning_rate": 1.5881888368043559e-06,
613
+ "loss": 0.5924215316772461,
614
+ "mean_token_accuracy": 0.8494464015960693,
615
+ "num_tokens": 19743047.0,
616
+ "step": 1350
617
+ },
618
+ {
619
+ "entropy": 0.6300237196683883,
620
+ "epoch": 2.450980392156863,
621
+ "grad_norm": 0.9961308836936951,
622
+ "learning_rate": 1.3665742728227932e-06,
623
+ "loss": 0.6133406066894531,
624
+ "mean_token_accuracy": 0.8462675029039383,
625
+ "num_tokens": 20105853.0,
626
+ "step": 1375
627
+ },
628
+ {
629
+ "entropy": 0.6148158556222916,
630
+ "epoch": 2.4955436720142603,
631
+ "grad_norm": 1.0224037170410156,
632
+ "learning_rate": 1.1600546683835065e-06,
633
+ "loss": 0.5978146362304687,
634
+ "mean_token_accuracy": 0.8488863033056259,
635
+ "num_tokens": 20469876.0,
636
+ "step": 1400
637
+ },
638
+ {
639
+ "epoch": 2.4955436720142603,
640
+ "eval_entropy": 0.627735008597374,
641
+ "eval_loss": 0.6408645510673523,
642
+ "eval_mean_token_accuracy": 0.8411034562587738,
643
+ "eval_num_tokens": 20469876.0,
644
+ "eval_runtime": 74.7955,
645
+ "eval_samples_per_second": 13.33,
646
+ "eval_steps_per_second": 3.342,
647
+ "step": 1400
648
+ },
649
+ {
650
+ "entropy": 0.6221208718419075,
651
+ "epoch": 2.5401069518716577,
652
+ "grad_norm": 1.0483691692352295,
653
+ "learning_rate": 9.691382883962515e-07,
654
+ "loss": 0.6043234634399414,
655
+ "mean_token_accuracy": 0.8475923782587051,
656
+ "num_tokens": 20834908.0,
657
+ "step": 1425
658
+ },
659
+ {
660
+ "entropy": 0.6163172733783722,
661
+ "epoch": 2.5846702317290555,
662
+ "grad_norm": 1.0169743299484253,
663
+ "learning_rate": 7.942949967120098e-07,
664
+ "loss": 0.6007443237304687,
665
+ "mean_token_accuracy": 0.8487154805660247,
666
+ "num_tokens": 21199575.0,
667
+ "step": 1450
668
+ },
669
+ {
670
+ "entropy": 0.6306376928091049,
671
+ "epoch": 2.629233511586453,
672
+ "grad_norm": 0.9749926328659058,
673
+ "learning_rate": 6.359550997421698e-07,
674
+ "loss": 0.6101107406616211,
675
+ "mean_token_accuracy": 0.8469714081287384,
676
+ "num_tokens": 21564414.0,
677
+ "step": 1475
678
+ },
679
+ {
680
+ "entropy": 0.6135326558351517,
681
+ "epoch": 2.6737967914438503,
682
+ "grad_norm": 1.0116835832595825,
683
+ "learning_rate": 4.945082874324541e-07,
684
+ "loss": 0.5956003189086914,
685
+ "mean_token_accuracy": 0.8500852519273758,
686
+ "num_tokens": 21928080.0,
687
+ "step": 1500
688
+ },
689
+ {
690
+ "entropy": 0.6165187922120094,
691
+ "epoch": 2.7183600713012477,
692
+ "grad_norm": 0.9928510785102844,
693
+ "learning_rate": 3.7030267419789764e-07,
694
+ "loss": 0.6013864135742187,
695
+ "mean_token_accuracy": 0.8494158619642258,
696
+ "num_tokens": 22296207.0,
697
+ "step": 1525
698
+ },
699
+ {
700
+ "entropy": 0.619775217473507,
701
+ "epoch": 2.762923351158645,
702
+ "grad_norm": 0.9901552796363831,
703
+ "learning_rate": 2.6364394217929856e-07,
704
+ "loss": 0.6034153366088867,
705
+ "mean_token_accuracy": 0.8480428576469421,
706
+ "num_tokens": 22661645.0,
707
+ "step": 1550
708
+ },
709
+ {
710
+ "entropy": 0.6086369237303734,
711
+ "epoch": 2.807486631016043,
712
+ "grad_norm": 0.8772838711738586,
713
+ "learning_rate": 1.7479458892961846e-07,
714
+ "loss": 0.5885520553588868,
715
+ "mean_token_accuracy": 0.8515380412340164,
716
+ "num_tokens": 23028234.0,
717
+ "step": 1575
718
+ },
719
+ {
720
+ "entropy": 0.6151553666591645,
721
+ "epoch": 2.8520499108734403,
722
+ "grad_norm": 0.9948622584342957,
723
+ "learning_rate": 1.0397328138187557e-07,
724
+ "loss": 0.5963270568847656,
725
+ "mean_token_accuracy": 0.8506602907180786,
726
+ "num_tokens": 23393799.0,
727
+ "step": 1600
728
+ },
729
+ {
730
+ "epoch": 2.8520499108734403,
731
+ "eval_entropy": 0.626643338561058,
732
+ "eval_loss": 0.6390902400016785,
733
+ "eval_mean_token_accuracy": 0.8414715526103973,
734
+ "eval_num_tokens": 23393799.0,
735
+ "eval_runtime": 74.7969,
736
+ "eval_samples_per_second": 13.329,
737
+ "eval_steps_per_second": 3.342,
738
+ "step": 1600
739
+ },
740
+ {
741
+ "entropy": 0.6176378938555718,
742
+ "epoch": 2.8966131907308377,
743
+ "grad_norm": 0.9002705216407776,
744
+ "learning_rate": 5.135431768847676e-08,
745
+ "loss": 0.5993848037719727,
746
+ "mean_token_accuracy": 0.8487631809711457,
747
+ "num_tokens": 23759121.0,
748
+ "step": 1625
749
+ },
750
+ {
751
+ "entropy": 0.6263965710997581,
752
+ "epoch": 2.9411764705882355,
753
+ "grad_norm": 0.9609607458114624,
754
+ "learning_rate": 1.7067198256442428e-08,
755
+ "loss": 0.6067921829223633,
756
+ "mean_token_accuracy": 0.8475446420907974,
757
+ "num_tokens": 24123974.0,
758
+ "step": 1650
759
+ },
760
+ {
761
+ "entropy": 0.6194088864326477,
762
+ "epoch": 2.985739750445633,
763
+ "grad_norm": 0.9630091190338135,
764
+ "learning_rate": 1.1963070342654869e-09,
765
+ "loss": 0.602127571105957,
766
+ "mean_token_accuracy": 0.8485302919149399,
767
+ "num_tokens": 24488475.0,
768
+ "step": 1675
769
+ }
770
+ ],
771
+ "logging_steps": 25,
772
+ "max_steps": 1683,
773
+ "num_input_tokens_seen": 0,
774
+ "num_train_epochs": 3,
775
+ "save_steps": 200,
776
+ "stateful_callbacks": {
777
+ "TrainerControl": {
778
+ "args": {
779
+ "should_epoch_stop": false,
780
+ "should_evaluate": false,
781
+ "should_log": false,
782
+ "should_save": true,
783
+ "should_training_stop": true
784
+ },
785
+ "attributes": {}
786
+ }
787
+ },
788
+ "total_flos": 1.1863540916795904e+18,
789
+ "train_batch_size": 4,
790
+ "trial_name": null,
791
+ "trial_params": null
792
+ }
checkpoint-1683/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33ef6c419df9522b8fc114339f82c3e390d52dabd314191546cdbca2a738c0af
3
+ size 5585
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "padding_side": "right",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "Qwen2Tokenizer",
29
+ "unk_token": null
30
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33ef6c419df9522b8fc114339f82c3e390d52dabd314191546cdbca2a738c0af
3
+ size 5585
training_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Qwen/Qwen2.5-7B-Instruct",
3
+ "lora_rank": 64,
4
+ "lora_alpha": 128,
5
+ "epochs": 3,
6
+ "batch": 16,
7
+ "lr": 1.5e-05,
8
+ "train_size": 8973,
9
+ "val_size": 997,
10
+ "quant": "none (bf16)"
11
+ }