OvertureMMXXI committed on
Commit
5f4ed8e
·
verified ·
1 Parent(s): e81b2a4

Upload model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-2811/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-Coder-7B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-Coder-7B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-Coder-7B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "up_proj",
36
+ "down_proj",
37
+ "q_proj",
38
+ "gate_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe40798da6a26060a9aa4921b95b61428d1a3f595c43c9ed0ad50f89466ebfdd
3
+ size 161533192
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-2500/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-Coder-7B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-Coder-7B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
checkpoint-2500/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-Coder-7B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "up_proj",
36
+ "down_proj",
37
+ "q_proj",
38
+ "gate_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-2500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01e6ad54155eec5fc04dbb93a177330bdf7385c2d54c17c6294147d5ffe4f4ce
3
+ size 161533192
checkpoint-2500/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-2500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d678c4b0d990cc6d48ff5375afd2912f9bd84dcb22fd04490eef4fdf821c1fcd
3
+ size 323291451
checkpoint-2500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f73870a1ac4ce8310ca23b2bbba15c21fb0406393b8fcd96ccece173f6ea82b
3
+ size 14645
checkpoint-2500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:623a525c3bd0ccda56e1c9b3d984f4bf52221143e2d8ffef8c0f5668fe3f0b46
3
+ size 1465
checkpoint-2500/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
checkpoint-2500/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 32768,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoint-2500/trainer_state.json ADDED
@@ -0,0 +1,949 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.8896005693443644,
6
+ "eval_steps": 500,
7
+ "global_step": 2500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.007116804554754915,
14
+ "grad_norm": 0.6649439334869385,
15
+ "learning_rate": 4.470588235294118e-05,
16
+ "loss": 1.1248294830322265,
17
+ "step": 20
18
+ },
19
+ {
20
+ "epoch": 0.01423360910950983,
21
+ "grad_norm": 0.5964099168777466,
22
+ "learning_rate": 9.176470588235295e-05,
23
+ "loss": 0.9246268272399902,
24
+ "step": 40
25
+ },
26
+ {
27
+ "epoch": 0.021350413664264746,
28
+ "grad_norm": 0.9485348463058472,
29
+ "learning_rate": 0.00013882352941176472,
30
+ "loss": 0.7519454002380371,
31
+ "step": 60
32
+ },
33
+ {
34
+ "epoch": 0.02846721821901966,
35
+ "grad_norm": 0.7866337299346924,
36
+ "learning_rate": 0.00018588235294117648,
37
+ "loss": 0.7462811946868897,
38
+ "step": 80
39
+ },
40
+ {
41
+ "epoch": 0.03558402277377457,
42
+ "grad_norm": 0.45363718271255493,
43
+ "learning_rate": 0.00019998698438490736,
44
+ "loss": 0.7312833309173584,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.04270082732852949,
49
+ "grad_norm": 0.4268025755882263,
50
+ "learning_rate": 0.00019992324258963413,
51
+ "loss": 0.6707011222839355,
52
+ "step": 120
53
+ },
54
+ {
55
+ "epoch": 0.049817631883284405,
56
+ "grad_norm": 0.5609312057495117,
57
+ "learning_rate": 0.00019980641781070307,
58
+ "loss": 0.6737659931182861,
59
+ "step": 140
60
+ },
61
+ {
62
+ "epoch": 0.05693443643803932,
63
+ "grad_norm": 0.8989323377609253,
64
+ "learning_rate": 0.00019963657210982948,
65
+ "loss": 0.6452206134796142,
66
+ "step": 160
67
+ },
68
+ {
69
+ "epoch": 0.06405124099279423,
70
+ "grad_norm": 0.5717238187789917,
71
+ "learning_rate": 0.00019941379571543596,
72
+ "loss": 0.683911657333374,
73
+ "step": 180
74
+ },
75
+ {
76
+ "epoch": 0.07116804554754914,
77
+ "grad_norm": 0.6093301773071289,
78
+ "learning_rate": 0.00019913820697471985,
79
+ "loss": 0.6362697124481201,
80
+ "step": 200
81
+ },
82
+ {
83
+ "epoch": 0.07828485010230407,
84
+ "grad_norm": 0.7696208357810974,
85
+ "learning_rate": 0.0001988099522907825,
86
+ "loss": 0.7254255294799805,
87
+ "step": 220
88
+ },
89
+ {
90
+ "epoch": 0.08540165465705898,
91
+ "grad_norm": 0.5218358635902405,
92
+ "learning_rate": 0.00019842920604485473,
93
+ "loss": 0.6266093254089355,
94
+ "step": 240
95
+ },
96
+ {
97
+ "epoch": 0.0925184592118139,
98
+ "grad_norm": 1.2436918020248413,
99
+ "learning_rate": 0.0001979961705036587,
100
+ "loss": 0.602960729598999,
101
+ "step": 260
102
+ },
103
+ {
104
+ "epoch": 0.09963526376656881,
105
+ "grad_norm": 0.560608446598053,
106
+ "learning_rate": 0.00019751107571195638,
107
+ "loss": 0.6430336475372315,
108
+ "step": 280
109
+ },
110
+ {
111
+ "epoch": 0.10675206832132372,
112
+ "grad_norm": 0.5952504873275757,
113
+ "learning_rate": 0.00019697417937034105,
114
+ "loss": 0.6837223529815674,
115
+ "step": 300
116
+ },
117
+ {
118
+ "epoch": 0.11386887287607864,
119
+ "grad_norm": 0.6205160617828369,
120
+ "learning_rate": 0.00019638576669833718,
121
+ "loss": 0.6728087425231933,
122
+ "step": 320
123
+ },
124
+ {
125
+ "epoch": 0.12098567743083355,
126
+ "grad_norm": 0.880753755569458,
127
+ "learning_rate": 0.00019574615028288087,
128
+ "loss": 0.5986090660095215,
129
+ "step": 340
130
+ },
131
+ {
132
+ "epoch": 0.12810248198558846,
133
+ "grad_norm": 0.4881354570388794,
134
+ "learning_rate": 0.00019505566991226213,
135
+ "loss": 0.6425183773040771,
136
+ "step": 360
137
+ },
138
+ {
139
+ "epoch": 0.13521928654034338,
140
+ "grad_norm": 0.5412693619728088,
141
+ "learning_rate": 0.00019431469239561646,
142
+ "loss": 0.6049218654632569,
143
+ "step": 380
144
+ },
145
+ {
146
+ "epoch": 0.1423360910950983,
147
+ "grad_norm": 0.8913177847862244,
148
+ "learning_rate": 0.000193523611368062,
149
+ "loss": 0.6385027408599854,
150
+ "step": 400
151
+ },
152
+ {
153
+ "epoch": 0.14945289564985323,
154
+ "grad_norm": 0.7349528074264526,
155
+ "learning_rate": 0.0001926828470815859,
156
+ "loss": 0.5983442783355712,
157
+ "step": 420
158
+ },
159
+ {
160
+ "epoch": 0.15656970020460814,
161
+ "grad_norm": 0.9059110283851624,
162
+ "learning_rate": 0.00019179284618179055,
163
+ "loss": 0.6437891960144043,
164
+ "step": 440
165
+ },
166
+ {
167
+ "epoch": 0.16368650475936306,
168
+ "grad_norm": 0.46235358715057373,
169
+ "learning_rate": 0.0001908540814706187,
170
+ "loss": 0.6499705791473389,
171
+ "step": 460
172
+ },
173
+ {
174
+ "epoch": 0.17080330931411797,
175
+ "grad_norm": 0.48955702781677246,
176
+ "learning_rate": 0.00018986705165518317,
177
+ "loss": 0.6326710224151612,
178
+ "step": 480
179
+ },
180
+ {
181
+ "epoch": 0.17792011386887288,
182
+ "grad_norm": 0.8433415293693542,
183
+ "learning_rate": 0.0001888322810828351,
184
+ "loss": 0.6291649818420411,
185
+ "step": 500
186
+ },
187
+ {
188
+ "epoch": 0.17792011386887288,
189
+ "eval_loss": 0.6958565711975098,
190
+ "eval_runtime": 61.9418,
191
+ "eval_samples_per_second": 7.313,
192
+ "eval_steps_per_second": 0.92,
193
+ "step": 500
194
+ },
195
+ {
196
+ "epoch": 0.1850369184236278,
197
+ "grad_norm": 0.701486349105835,
198
+ "learning_rate": 0.00018775031946261064,
199
+ "loss": 0.5890274524688721,
200
+ "step": 520
201
+ },
202
+ {
203
+ "epoch": 0.1921537229783827,
204
+ "grad_norm": 0.8379756212234497,
205
+ "learning_rate": 0.00018662174157320512,
206
+ "loss": 0.5636214256286621,
207
+ "step": 540
208
+ },
209
+ {
210
+ "epoch": 0.19927052753313762,
211
+ "grad_norm": 0.9337961077690125,
212
+ "learning_rate": 0.0001854471469576289,
213
+ "loss": 0.6806112766265869,
214
+ "step": 560
215
+ },
216
+ {
217
+ "epoch": 0.20638733208789253,
218
+ "grad_norm": 0.7828953266143799,
219
+ "learning_rate": 0.00018422715960470738,
220
+ "loss": 0.5343317985534668,
221
+ "step": 580
222
+ },
223
+ {
224
+ "epoch": 0.21350413664264745,
225
+ "grad_norm": 0.5975359678268433,
226
+ "learning_rate": 0.00018296242761759498,
227
+ "loss": 0.6245263099670411,
228
+ "step": 600
229
+ },
230
+ {
231
+ "epoch": 0.22062094119740236,
232
+ "grad_norm": 0.6228912472724915,
233
+ "learning_rate": 0.00018165362286947815,
234
+ "loss": 0.661290979385376,
235
+ "step": 620
236
+ },
237
+ {
238
+ "epoch": 0.22773774575215727,
239
+ "grad_norm": 0.9198949933052063,
240
+ "learning_rate": 0.00018030144064665125,
241
+ "loss": 0.6715561866760253,
242
+ "step": 640
243
+ },
244
+ {
245
+ "epoch": 0.23485455030691219,
246
+ "grad_norm": 0.6074244976043701,
247
+ "learning_rate": 0.00017890659927915418,
248
+ "loss": 0.5627779006958008,
249
+ "step": 660
250
+ },
251
+ {
252
+ "epoch": 0.2419713548616671,
253
+ "grad_norm": 0.8562520146369934,
254
+ "learning_rate": 0.0001774698397591685,
255
+ "loss": 0.6850435256958007,
256
+ "step": 680
257
+ },
258
+ {
259
+ "epoch": 0.24908815941642204,
260
+ "grad_norm": 0.7890612483024597,
261
+ "learning_rate": 0.0001759919253473745,
262
+ "loss": 0.5971568584442138,
263
+ "step": 700
264
+ },
265
+ {
266
+ "epoch": 0.2562049639711769,
267
+ "grad_norm": 0.6242470741271973,
268
+ "learning_rate": 0.0001744736411674786,
269
+ "loss": 0.6526790618896484,
270
+ "step": 720
271
+ },
272
+ {
273
+ "epoch": 0.26332176852593187,
274
+ "grad_norm": 0.6237924098968506,
275
+ "learning_rate": 0.00017291579378912576,
276
+ "loss": 0.614622688293457,
277
+ "step": 740
278
+ },
279
+ {
280
+ "epoch": 0.27043857308068675,
281
+ "grad_norm": 1.250014066696167,
282
+ "learning_rate": 0.00017131921079941966,
283
+ "loss": 0.6886546611785889,
284
+ "step": 760
285
+ },
286
+ {
287
+ "epoch": 0.2775553776354417,
288
+ "grad_norm": 0.5347809791564941,
289
+ "learning_rate": 0.00016968474036327733,
290
+ "loss": 0.638498067855835,
291
+ "step": 780
292
+ },
293
+ {
294
+ "epoch": 0.2846721821901966,
295
+ "grad_norm": 0.7605909705162048,
296
+ "learning_rate": 0.0001680132507728518,
297
+ "loss": 0.6557466983795166,
298
+ "step": 800
299
+ },
300
+ {
301
+ "epoch": 0.2917889867449515,
302
+ "grad_norm": 0.647409975528717,
303
+ "learning_rate": 0.00016630562998626286,
304
+ "loss": 0.588128662109375,
305
+ "step": 820
306
+ },
307
+ {
308
+ "epoch": 0.29890579129970646,
309
+ "grad_norm": 0.5860188007354736,
310
+ "learning_rate": 0.00016456278515588024,
311
+ "loss": 0.6051214694976806,
312
+ "step": 840
313
+ },
314
+ {
315
+ "epoch": 0.30602259585446134,
316
+ "grad_norm": 0.875608503818512,
317
+ "learning_rate": 0.00016278564214641024,
318
+ "loss": 0.5963128089904786,
319
+ "step": 860
320
+ },
321
+ {
322
+ "epoch": 0.3131394004092163,
323
+ "grad_norm": 0.6618257164955139,
324
+ "learning_rate": 0.00016097514504304148,
325
+ "loss": 0.6630919933319092,
326
+ "step": 880
327
+ },
328
+ {
329
+ "epoch": 0.32025620496397117,
330
+ "grad_norm": 0.49601805210113525,
331
+ "learning_rate": 0.00015913225564991143,
332
+ "loss": 0.7227569103240967,
333
+ "step": 900
334
+ },
335
+ {
336
+ "epoch": 0.3273730095187261,
337
+ "grad_norm": 0.8401608467102051,
338
+ "learning_rate": 0.0001572579529791598,
339
+ "loss": 0.5685585975646973,
340
+ "step": 920
341
+ },
342
+ {
343
+ "epoch": 0.334489814073481,
344
+ "grad_norm": 0.5503961443901062,
345
+ "learning_rate": 0.00015535323273084062,
346
+ "loss": 0.6007286548614502,
347
+ "step": 940
348
+ },
349
+ {
350
+ "epoch": 0.34160661862823594,
351
+ "grad_norm": 0.8694468140602112,
352
+ "learning_rate": 0.0001534191067639688,
353
+ "loss": 0.5565601825714112,
354
+ "step": 960
355
+ },
356
+ {
357
+ "epoch": 0.3487234231829908,
358
+ "grad_norm": 0.5447700023651123,
359
+ "learning_rate": 0.00015145660255898262,
360
+ "loss": 0.5796232223510742,
361
+ "step": 980
362
+ },
363
+ {
364
+ "epoch": 0.35584022773774576,
365
+ "grad_norm": 0.7616499662399292,
366
+ "learning_rate": 0.00014946676267190752,
367
+ "loss": 0.5668922424316406,
368
+ "step": 1000
369
+ },
370
+ {
371
+ "epoch": 0.35584022773774576,
372
+ "eval_loss": 0.6924759149551392,
373
+ "eval_runtime": 61.6748,
374
+ "eval_samples_per_second": 7.345,
375
+ "eval_steps_per_second": 0.924,
376
+ "step": 1000
377
+ },
378
+ {
379
+ "epoch": 0.36295703229250065,
380
+ "grad_norm": 0.8111125230789185,
381
+ "learning_rate": 0.00014745064418051108,
382
+ "loss": 0.6030837535858155,
383
+ "step": 1020
384
+ },
385
+ {
386
+ "epoch": 0.3700738368472556,
387
+ "grad_norm": 0.5550377368927002,
388
+ "learning_rate": 0.00014540931812274358,
389
+ "loss": 0.6802701473236084,
390
+ "step": 1040
391
+ },
392
+ {
393
+ "epoch": 0.3771906414020105,
394
+ "grad_norm": 0.7379717230796814,
395
+ "learning_rate": 0.00014334386892776247,
396
+ "loss": 0.5119946956634521,
397
+ "step": 1060
398
+ },
399
+ {
400
+ "epoch": 0.3843074459567654,
401
+ "grad_norm": 0.6235134601593018,
402
+ "learning_rate": 0.00014125539383984264,
403
+ "loss": 0.606415843963623,
404
+ "step": 1080
405
+ },
406
+ {
407
+ "epoch": 0.3914242505115203,
408
+ "grad_norm": 1.4088711738586426,
409
+ "learning_rate": 0.00013914500233547908,
410
+ "loss": 0.6457336902618408,
411
+ "step": 1100
412
+ },
413
+ {
414
+ "epoch": 0.39854105506627524,
415
+ "grad_norm": 0.6606688499450684,
416
+ "learning_rate": 0.00013701381553399145,
417
+ "loss": 0.6700205326080322,
418
+ "step": 1120
419
+ },
420
+ {
421
+ "epoch": 0.4056578596210302,
422
+ "grad_norm": 0.6286507248878479,
423
+ "learning_rate": 0.0001348629656019429,
424
+ "loss": 0.5820858001708984,
425
+ "step": 1140
426
+ },
427
+ {
428
+ "epoch": 0.41277466417578507,
429
+ "grad_norm": 0.5580460429191589,
430
+ "learning_rate": 0.00013269359515169114,
431
+ "loss": 0.5733586311340332,
432
+ "step": 1160
433
+ },
434
+ {
435
+ "epoch": 0.41989146873054,
436
+ "grad_norm": 0.7134071588516235,
437
+ "learning_rate": 0.0001305068566343893,
438
+ "loss": 0.6202582359313965,
439
+ "step": 1180
440
+ },
441
+ {
442
+ "epoch": 0.4270082732852949,
443
+ "grad_norm": 0.7348489761352539,
444
+ "learning_rate": 0.000128303911727761,
445
+ "loss": 0.6173674583435058,
446
+ "step": 1200
447
+ },
448
+ {
449
+ "epoch": 0.43412507784004983,
450
+ "grad_norm": 1.1008996963500977,
451
+ "learning_rate": 0.0001260859307189731,
452
+ "loss": 0.61796555519104,
453
+ "step": 1220
454
+ },
455
+ {
456
+ "epoch": 0.4412418823948047,
457
+ "grad_norm": 0.9191972613334656,
458
+ "learning_rate": 0.0001238540918829353,
459
+ "loss": 0.6021127223968505,
460
+ "step": 1240
461
+ },
462
+ {
463
+ "epoch": 0.44835868694955966,
464
+ "grad_norm": 0.6578056216239929,
465
+ "learning_rate": 0.00012160958085635628,
466
+ "loss": 0.6018884658813477,
467
+ "step": 1260
468
+ },
469
+ {
470
+ "epoch": 0.45547549150431454,
471
+ "grad_norm": 0.6848057508468628,
472
+ "learning_rate": 0.00011935359000788873,
473
+ "loss": 0.5744600772857666,
474
+ "step": 1280
475
+ },
476
+ {
477
+ "epoch": 0.4625922960590695,
478
+ "grad_norm": 0.6195323467254639,
479
+ "learning_rate": 0.0001170873178046985,
480
+ "loss": 0.5539962768554687,
481
+ "step": 1300
482
+ },
483
+ {
484
+ "epoch": 0.46970910061382437,
485
+ "grad_norm": 0.6317474842071533,
486
+ "learning_rate": 0.00011481196817579352,
487
+ "loss": 0.5842368602752686,
488
+ "step": 1320
489
+ },
490
+ {
491
+ "epoch": 0.4768259051685793,
492
+ "grad_norm": 0.6639163494110107,
493
+ "learning_rate": 0.00011252874987245164,
494
+ "loss": 0.5572715759277344,
495
+ "step": 1340
496
+ },
497
+ {
498
+ "epoch": 0.4839427097233342,
499
+ "grad_norm": 0.519478976726532,
500
+ "learning_rate": 0.00011023887582608646,
501
+ "loss": 0.5544273376464843,
502
+ "step": 1360
503
+ },
504
+ {
505
+ "epoch": 0.49105951427808914,
506
+ "grad_norm": 0.7147398591041565,
507
+ "learning_rate": 0.0001079435625038925,
508
+ "loss": 0.5225166797637939,
509
+ "step": 1380
510
+ },
511
+ {
512
+ "epoch": 0.4981763188328441,
513
+ "grad_norm": 0.7628998160362244,
514
+ "learning_rate": 0.00010564402926261217,
515
+ "loss": 0.5659779071807861,
516
+ "step": 1400
517
+ },
518
+ {
519
+ "epoch": 0.505293123387599,
520
+ "grad_norm": 0.8272300958633423,
521
+ "learning_rate": 0.00010334149770076747,
522
+ "loss": 0.6124475955963135,
523
+ "step": 1420
524
+ },
525
+ {
526
+ "epoch": 0.5124099279423538,
527
+ "grad_norm": 0.6211256980895996,
528
+ "learning_rate": 0.00010103719100970115,
529
+ "loss": 0.5526745796203614,
530
+ "step": 1440
531
+ },
532
+ {
533
+ "epoch": 0.5195267324971088,
534
+ "grad_norm": 0.8070167899131775,
535
+ "learning_rate": 9.873233332377124e-05,
536
+ "loss": 0.5943079948425293,
537
+ "step": 1460
538
+ },
539
+ {
540
+ "epoch": 0.5266435370518637,
541
+ "grad_norm": 0.5424654483795166,
542
+ "learning_rate": 9.642814907004504e-05,
543
+ "loss": 0.5948707580566406,
544
+ "step": 1480
545
+ },
546
+ {
547
+ "epoch": 0.5337603416066187,
548
+ "grad_norm": 0.7003446817398071,
549
+ "learning_rate": 9.41258623178373e-05,
550
+ "loss": 0.6317638397216797,
551
+ "step": 1500
552
+ },
553
+ {
554
+ "epoch": 0.5337603416066187,
555
+ "eval_loss": 0.6935540437698364,
556
+ "eval_runtime": 61.6485,
557
+ "eval_samples_per_second": 7.348,
558
+ "eval_steps_per_second": 0.925,
559
+ "step": 1500
560
+ },
561
+ {
562
+ "epoch": 0.5408771461613735,
563
+ "grad_norm": 0.6322863101959229,
564
+ "learning_rate": 9.182669612843861e-05,
565
+ "loss": 0.665757417678833,
566
+ "step": 1520
567
+ },
568
+ {
569
+ "epoch": 0.5479939507161284,
570
+ "grad_norm": 0.6435247659683228,
571
+ "learning_rate": 8.953187190537928e-05,
572
+ "loss": 0.4951322078704834,
573
+ "step": 1540
574
+ },
575
+ {
576
+ "epoch": 0.5551107552708834,
577
+ "grad_norm": 0.8010550141334534,
578
+ "learning_rate": 8.724260874557384e-05,
579
+ "loss": 0.6802570343017578,
580
+ "step": 1560
581
+ },
582
+ {
583
+ "epoch": 0.5622275598256383,
584
+ "grad_norm": 0.8338032960891724,
585
+ "learning_rate": 8.496012279169097e-05,
586
+ "loss": 0.5899542331695556,
587
+ "step": 1580
588
+ },
589
+ {
590
+ "epoch": 0.5693443643803932,
591
+ "grad_norm": 0.7749608159065247,
592
+ "learning_rate": 8.268562658609254e-05,
593
+ "loss": 0.6368399620056152,
594
+ "step": 1600
595
+ },
596
+ {
597
+ "epoch": 0.5764611689351481,
598
+ "grad_norm": 0.7659400105476379,
599
+ "learning_rate": 8.042032842668596e-05,
600
+ "loss": 0.5484944820404053,
601
+ "step": 1620
602
+ },
603
+ {
604
+ "epoch": 0.583577973489903,
605
+ "grad_norm": 1.1483023166656494,
606
+ "learning_rate": 7.816543172503053e-05,
607
+ "loss": 0.554067325592041,
608
+ "step": 1640
609
+ },
610
+ {
611
+ "epoch": 0.590694778044658,
612
+ "grad_norm": 0.6539034247398376,
613
+ "learning_rate": 7.592213436704003e-05,
614
+ "loss": 0.5624193668365478,
615
+ "step": 1660
616
+ },
617
+ {
618
+ "epoch": 0.5978115825994129,
619
+ "grad_norm": 0.6587591767311096,
620
+ "learning_rate": 7.369162807662087e-05,
621
+ "loss": 0.6478344917297363,
622
+ "step": 1680
623
+ },
624
+ {
625
+ "epoch": 0.6049283871541677,
626
+ "grad_norm": 0.5699108839035034,
627
+ "learning_rate": 7.147509778258334e-05,
628
+ "loss": 0.6776344299316406,
629
+ "step": 1700
630
+ },
631
+ {
632
+ "epoch": 0.6120451917089227,
633
+ "grad_norm": 0.6976863145828247,
634
+ "learning_rate": 6.927372098916294e-05,
635
+ "loss": 0.615149450302124,
636
+ "step": 1720
637
+ },
638
+ {
639
+ "epoch": 0.6191619962636776,
640
+ "grad_norm": 0.7388312220573425,
641
+ "learning_rate": 6.708866715048585e-05,
642
+ "loss": 0.6845808982849121,
643
+ "step": 1740
644
+ },
645
+ {
646
+ "epoch": 0.6262788008184326,
647
+ "grad_norm": 0.5257291793823242,
648
+ "learning_rate": 6.492109704931101e-05,
649
+ "loss": 0.6309232234954834,
650
+ "step": 1760
651
+ },
652
+ {
653
+ "epoch": 0.6333956053731874,
654
+ "grad_norm": 0.7461546063423157,
655
+ "learning_rate": 6.277216218037849e-05,
656
+ "loss": 0.6485635757446289,
657
+ "step": 1780
658
+ },
659
+ {
660
+ "epoch": 0.6405124099279423,
661
+ "grad_norm": 0.7209606766700745,
662
+ "learning_rate": 6.0643004138692375e-05,
663
+ "loss": 0.6348707675933838,
664
+ "step": 1800
665
+ },
666
+ {
667
+ "epoch": 0.6476292144826973,
668
+ "grad_norm": 0.6697937250137329,
669
+ "learning_rate": 5.853475401306241e-05,
670
+ "loss": 0.5393397331237793,
671
+ "step": 1820
672
+ },
673
+ {
674
+ "epoch": 0.6547460190374522,
675
+ "grad_norm": 0.6282167434692383,
676
+ "learning_rate": 5.644853178522733e-05,
677
+ "loss": 0.7485171794891358,
678
+ "step": 1840
679
+ },
680
+ {
681
+ "epoch": 0.661862823592207,
682
+ "grad_norm": 0.6313266158103943,
683
+ "learning_rate": 5.438544573487811e-05,
684
+ "loss": 0.5625770092010498,
685
+ "step": 1860
686
+ },
687
+ {
688
+ "epoch": 0.668979628146962,
689
+ "grad_norm": 0.6948370933532715,
690
+ "learning_rate": 5.23465918508984e-05,
691
+ "loss": 0.6276655197143555,
692
+ "step": 1880
693
+ },
694
+ {
695
+ "epoch": 0.6760964327017169,
696
+ "grad_norm": 0.6330398321151733,
697
+ "learning_rate": 5.0333053249133924e-05,
698
+ "loss": 0.541553258895874,
699
+ "step": 1900
700
+ },
701
+ {
702
+ "epoch": 0.6832132372564719,
703
+ "grad_norm": 0.7645998001098633,
704
+ "learning_rate": 4.834589959700061e-05,
705
+ "loss": 0.5345258235931396,
706
+ "step": 1920
707
+ },
708
+ {
709
+ "epoch": 0.6903300418112268,
710
+ "grad_norm": 0.6248979568481445,
711
+ "learning_rate": 4.6386186545237054e-05,
712
+ "loss": 0.6255132675170898,
713
+ "step": 1940
714
+ },
715
+ {
716
+ "epoch": 0.6974468463659816,
717
+ "grad_norm": 0.8874566555023193,
718
+ "learning_rate": 4.445495516710312e-05,
719
+ "loss": 0.5986682891845703,
720
+ "step": 1960
721
+ },
722
+ {
723
+ "epoch": 0.7045636509207366,
724
+ "grad_norm": 0.7236739993095398,
725
+ "learning_rate": 4.2553231405322724e-05,
726
+ "loss": 0.6396020412445068,
727
+ "step": 1980
728
+ },
729
+ {
730
+ "epoch": 0.7116804554754915,
731
+ "grad_norm": 0.57773357629776,
732
+ "learning_rate": 4.0682025527064486e-05,
733
+ "loss": 0.5397399425506592,
734
+ "step": 2000
735
+ },
736
+ {
737
+ "epoch": 0.7116804554754915,
738
+ "eval_loss": 0.6902480721473694,
739
+ "eval_runtime": 61.8578,
740
+ "eval_samples_per_second": 7.323,
741
+ "eval_steps_per_second": 0.921,
742
+ "step": 2000
743
+ },
744
+ {
745
+ "epoch": 0.7187972600302465,
746
+ "grad_norm": 0.9456912875175476,
747
+ "learning_rate": 3.8842331587249756e-05,
748
+ "loss": 0.5803652286529541,
749
+ "step": 2020
750
+ },
751
+ {
752
+ "epoch": 0.7259140645850013,
753
+ "grad_norm": 0.6886982321739197,
754
+ "learning_rate": 3.703512690047336e-05,
755
+ "loss": 0.6748288154602051,
756
+ "step": 2040
757
+ },
758
+ {
759
+ "epoch": 0.7330308691397562,
760
+ "grad_norm": 0.6338731050491333,
761
+ "learning_rate": 3.5261371521817244e-05,
762
+ "loss": 0.47889223098754885,
763
+ "step": 2060
764
+ },
765
+ {
766
+ "epoch": 0.7401476736945112,
767
+ "grad_norm": 0.6906462907791138,
768
+ "learning_rate": 3.352200773683317e-05,
769
+ "loss": 0.6045923233032227,
770
+ "step": 2080
771
+ },
772
+ {
773
+ "epoch": 0.7472644782492661,
774
+ "grad_norm": 0.8363668322563171,
775
+ "learning_rate": 3.1817959560965215e-05,
776
+ "loss": 0.5399829387664795,
777
+ "step": 2100
778
+ },
779
+ {
780
+ "epoch": 0.754381282804021,
781
+ "grad_norm": 0.5785157680511475,
782
+ "learning_rate": 3.0150132248677976e-05,
783
+ "loss": 0.64073805809021,
784
+ "step": 2120
785
+ },
786
+ {
787
+ "epoch": 0.7614980873587759,
788
+ "grad_norm": 0.7285847663879395,
789
+ "learning_rate": 2.8519411812551388e-05,
790
+ "loss": 0.6770923137664795,
791
+ "step": 2140
792
+ },
793
+ {
794
+ "epoch": 0.7686148919135308,
795
+ "grad_norm": 0.5978537201881409,
796
+ "learning_rate": 2.6926664552597535e-05,
797
+ "loss": 0.6357324123382568,
798
+ "step": 2160
799
+ },
800
+ {
801
+ "epoch": 0.7757316964682858,
802
+ "grad_norm": 0.5783416032791138,
803
+ "learning_rate": 2.5372736596049418e-05,
804
+ "loss": 0.5521987438201904,
805
+ "step": 2180
806
+ },
807
+ {
808
+ "epoch": 0.7828485010230406,
809
+ "grad_norm": 0.6857144236564636,
810
+ "learning_rate": 2.3858453447866557e-05,
811
+ "loss": 0.6403320789337158,
812
+ "step": 2200
813
+ },
814
+ {
815
+ "epoch": 0.7899653055777955,
816
+ "grad_norm": 0.6665685772895813,
817
+ "learning_rate": 2.2384619552195518e-05,
818
+ "loss": 0.5500887393951416,
819
+ "step": 2220
820
+ },
821
+ {
822
+ "epoch": 0.7970821101325505,
823
+ "grad_norm": 0.6291202306747437,
824
+ "learning_rate": 2.0952017865019036e-05,
825
+ "loss": 0.5161121368408204,
826
+ "step": 2240
827
+ },
828
+ {
829
+ "epoch": 0.8041989146873054,
830
+ "grad_norm": 0.487289696931839,
831
+ "learning_rate": 1.9561409438220247e-05,
832
+ "loss": 0.547119140625,
833
+ "step": 2260
834
+ },
835
+ {
836
+ "epoch": 0.8113157192420604,
837
+ "grad_norm": 0.6173009872436523,
838
+ "learning_rate": 1.8213533015283525e-05,
839
+ "loss": 0.6167063236236572,
840
+ "step": 2280
841
+ },
842
+ {
843
+ "epoch": 0.8184325237968152,
844
+ "grad_norm": 0.7068958282470703,
845
+ "learning_rate": 1.6909104638845986e-05,
846
+ "loss": 0.5167754173278809,
847
+ "step": 2300
848
+ },
849
+ {
850
+ "epoch": 0.8255493283515701,
851
+ "grad_norm": 0.8526712656021118,
852
+ "learning_rate": 1.5648817270308645e-05,
853
+ "loss": 0.5217617511749267,
854
+ "step": 2320
855
+ },
856
+ {
857
+ "epoch": 0.8326661329063251,
858
+ "grad_norm": 0.7311387658119202,
859
+ "learning_rate": 1.4433340421709595e-05,
860
+ "loss": 0.6144563674926757,
861
+ "step": 2340
862
+ },
863
+ {
864
+ "epoch": 0.83978293746108,
865
+ "grad_norm": 0.5354213118553162,
866
+ "learning_rate": 1.3263319800053697e-05,
867
+ "loss": 0.6949819087982178,
868
+ "step": 2360
869
+ },
870
+ {
871
+ "epoch": 0.8468997420158348,
872
+ "grad_norm": 0.7716789841651917,
873
+ "learning_rate": 1.2139376964288852e-05,
874
+ "loss": 0.5799826145172119,
875
+ "step": 2380
876
+ },
877
+ {
878
+ "epoch": 0.8540165465705898,
879
+ "grad_norm": 0.7061272263526917,
880
+ "learning_rate": 1.1062108995110565e-05,
881
+ "loss": 0.5493000030517579,
882
+ "step": 2400
883
+ },
884
+ {
885
+ "epoch": 0.8611333511253447,
886
+ "grad_norm": 0.49559611082077026,
887
+ "learning_rate": 1.003208817777025e-05,
888
+ "loss": 0.5216735363006592,
889
+ "step": 2420
890
+ },
891
+ {
892
+ "epoch": 0.8682501556800997,
893
+ "grad_norm": 0.737585186958313,
894
+ "learning_rate": 9.049861698055694e-06,
895
+ "loss": 0.5979462623596191,
896
+ "step": 2440
897
+ },
898
+ {
899
+ "epoch": 0.8753669602348545,
900
+ "grad_norm": 0.745577871799469,
901
+ "learning_rate": 8.115951351605378e-06,
902
+ "loss": 0.5613039016723633,
903
+ "step": 2460
904
+ },
905
+ {
906
+ "epoch": 0.8824837647896094,
907
+ "grad_norm": 0.686896026134491,
908
+ "learning_rate": 7.230853266711124e-06,
909
+ "loss": 0.7399949073791504,
910
+ "step": 2480
911
+ },
912
+ {
913
+ "epoch": 0.8896005693443644,
914
+ "grad_norm": 0.47512194514274597,
915
+ "learning_rate": 6.395037640756074e-06,
916
+ "loss": 0.6489872455596923,
917
+ "step": 2500
918
+ },
919
+ {
920
+ "epoch": 0.8896005693443644,
921
+ "eval_loss": 0.6895355582237244,
922
+ "eval_runtime": 61.7594,
923
+ "eval_samples_per_second": 7.335,
924
+ "eval_steps_per_second": 0.923,
925
+ "step": 2500
926
+ }
927
+ ],
928
+ "logging_steps": 20,
929
+ "max_steps": 2811,
930
+ "num_input_tokens_seen": 0,
931
+ "num_train_epochs": 1,
932
+ "save_steps": 500,
933
+ "stateful_callbacks": {
934
+ "TrainerControl": {
935
+ "args": {
936
+ "should_epoch_stop": false,
937
+ "should_evaluate": false,
938
+ "should_log": false,
939
+ "should_save": true,
940
+ "should_training_stop": false
941
+ },
942
+ "attributes": {}
943
+ }
944
+ },
945
+ "total_flos": 1.603670195866411e+18,
946
+ "train_batch_size": 1,
947
+ "trial_name": null,
948
+ "trial_params": null
949
+ }
checkpoint-2500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c08bb00e1e243b1f9e9b6645a8d5bd488ad57e7beb1e85e7b7cf4fda61f49e32
3
+ size 5137
checkpoint-2811/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-Coder-7B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-Coder-7B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
checkpoint-2811/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-Coder-7B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "up_proj",
36
+ "down_proj",
37
+ "q_proj",
38
+ "gate_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-2811/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe40798da6a26060a9aa4921b95b61428d1a3f595c43c9ed0ad50f89466ebfdd
3
+ size 161533192
checkpoint-2811/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-2811/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0067a2c4c27bb9b002f68d87f7d330dead44904981f5cc9ef410895b60c584b0
3
+ size 323291451
checkpoint-2811/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f2666db077b43e301571b90bacda0467858f9f4446736ccc411c44634870871
3
+ size 14645
checkpoint-2811/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0d57218568dac09c5c800c9e18599a9a2a30e472441d9995ecc4b00bd4ebd57
3
+ size 1465
checkpoint-2811/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
checkpoint-2811/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 32768,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoint-2811/trainer_state.json ADDED
@@ -0,0 +1,1054 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2811,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.007116804554754915,
14
+ "grad_norm": 0.6649439334869385,
15
+ "learning_rate": 4.470588235294118e-05,
16
+ "loss": 1.1248294830322265,
17
+ "step": 20
18
+ },
19
+ {
20
+ "epoch": 0.01423360910950983,
21
+ "grad_norm": 0.5964099168777466,
22
+ "learning_rate": 9.176470588235295e-05,
23
+ "loss": 0.9246268272399902,
24
+ "step": 40
25
+ },
26
+ {
27
+ "epoch": 0.021350413664264746,
28
+ "grad_norm": 0.9485348463058472,
29
+ "learning_rate": 0.00013882352941176472,
30
+ "loss": 0.7519454002380371,
31
+ "step": 60
32
+ },
33
+ {
34
+ "epoch": 0.02846721821901966,
35
+ "grad_norm": 0.7866337299346924,
36
+ "learning_rate": 0.00018588235294117648,
37
+ "loss": 0.7462811946868897,
38
+ "step": 80
39
+ },
40
+ {
41
+ "epoch": 0.03558402277377457,
42
+ "grad_norm": 0.45363718271255493,
43
+ "learning_rate": 0.00019998698438490736,
44
+ "loss": 0.7312833309173584,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.04270082732852949,
49
+ "grad_norm": 0.4268025755882263,
50
+ "learning_rate": 0.00019992324258963413,
51
+ "loss": 0.6707011222839355,
52
+ "step": 120
53
+ },
54
+ {
55
+ "epoch": 0.049817631883284405,
56
+ "grad_norm": 0.5609312057495117,
57
+ "learning_rate": 0.00019980641781070307,
58
+ "loss": 0.6737659931182861,
59
+ "step": 140
60
+ },
61
+ {
62
+ "epoch": 0.05693443643803932,
63
+ "grad_norm": 0.8989323377609253,
64
+ "learning_rate": 0.00019963657210982948,
65
+ "loss": 0.6452206134796142,
66
+ "step": 160
67
+ },
68
+ {
69
+ "epoch": 0.06405124099279423,
70
+ "grad_norm": 0.5717238187789917,
71
+ "learning_rate": 0.00019941379571543596,
72
+ "loss": 0.683911657333374,
73
+ "step": 180
74
+ },
75
+ {
76
+ "epoch": 0.07116804554754914,
77
+ "grad_norm": 0.6093301773071289,
78
+ "learning_rate": 0.00019913820697471985,
79
+ "loss": 0.6362697124481201,
80
+ "step": 200
81
+ },
82
+ {
83
+ "epoch": 0.07828485010230407,
84
+ "grad_norm": 0.7696208357810974,
85
+ "learning_rate": 0.0001988099522907825,
86
+ "loss": 0.7254255294799805,
87
+ "step": 220
88
+ },
89
+ {
90
+ "epoch": 0.08540165465705898,
91
+ "grad_norm": 0.5218358635902405,
92
+ "learning_rate": 0.00019842920604485473,
93
+ "loss": 0.6266093254089355,
94
+ "step": 240
95
+ },
96
+ {
97
+ "epoch": 0.0925184592118139,
98
+ "grad_norm": 1.2436918020248413,
99
+ "learning_rate": 0.0001979961705036587,
100
+ "loss": 0.602960729598999,
101
+ "step": 260
102
+ },
103
+ {
104
+ "epoch": 0.09963526376656881,
105
+ "grad_norm": 0.560608446598053,
106
+ "learning_rate": 0.00019751107571195638,
107
+ "loss": 0.6430336475372315,
108
+ "step": 280
109
+ },
110
+ {
111
+ "epoch": 0.10675206832132372,
112
+ "grad_norm": 0.5952504873275757,
113
+ "learning_rate": 0.00019697417937034105,
114
+ "loss": 0.6837223529815674,
115
+ "step": 300
116
+ },
117
+ {
118
+ "epoch": 0.11386887287607864,
119
+ "grad_norm": 0.6205160617828369,
120
+ "learning_rate": 0.00019638576669833718,
121
+ "loss": 0.6728087425231933,
122
+ "step": 320
123
+ },
124
+ {
125
+ "epoch": 0.12098567743083355,
126
+ "grad_norm": 0.880753755569458,
127
+ "learning_rate": 0.00019574615028288087,
128
+ "loss": 0.5986090660095215,
129
+ "step": 340
130
+ },
131
+ {
132
+ "epoch": 0.12810248198558846,
133
+ "grad_norm": 0.4881354570388794,
134
+ "learning_rate": 0.00019505566991226213,
135
+ "loss": 0.6425183773040771,
136
+ "step": 360
137
+ },
138
+ {
139
+ "epoch": 0.13521928654034338,
140
+ "grad_norm": 0.5412693619728088,
141
+ "learning_rate": 0.00019431469239561646,
142
+ "loss": 0.6049218654632569,
143
+ "step": 380
144
+ },
145
+ {
146
+ "epoch": 0.1423360910950983,
147
+ "grad_norm": 0.8913177847862244,
148
+ "learning_rate": 0.000193523611368062,
149
+ "loss": 0.6385027408599854,
150
+ "step": 400
151
+ },
152
+ {
153
+ "epoch": 0.14945289564985323,
154
+ "grad_norm": 0.7349528074264526,
155
+ "learning_rate": 0.0001926828470815859,
156
+ "loss": 0.5983442783355712,
157
+ "step": 420
158
+ },
159
+ {
160
+ "epoch": 0.15656970020460814,
161
+ "grad_norm": 0.9059110283851624,
162
+ "learning_rate": 0.00019179284618179055,
163
+ "loss": 0.6437891960144043,
164
+ "step": 440
165
+ },
166
+ {
167
+ "epoch": 0.16368650475936306,
168
+ "grad_norm": 0.46235358715057373,
169
+ "learning_rate": 0.0001908540814706187,
170
+ "loss": 0.6499705791473389,
171
+ "step": 460
172
+ },
173
+ {
174
+ "epoch": 0.17080330931411797,
175
+ "grad_norm": 0.48955702781677246,
176
+ "learning_rate": 0.00018986705165518317,
177
+ "loss": 0.6326710224151612,
178
+ "step": 480
179
+ },
180
+ {
181
+ "epoch": 0.17792011386887288,
182
+ "grad_norm": 0.8433415293693542,
183
+ "learning_rate": 0.0001888322810828351,
184
+ "loss": 0.6291649818420411,
185
+ "step": 500
186
+ },
187
+ {
188
+ "epoch": 0.17792011386887288,
189
+ "eval_loss": 0.6958565711975098,
190
+ "eval_runtime": 61.9418,
191
+ "eval_samples_per_second": 7.313,
192
+ "eval_steps_per_second": 0.92,
193
+ "step": 500
194
+ },
195
+ {
196
+ "epoch": 0.1850369184236278,
197
+ "grad_norm": 0.701486349105835,
198
+ "learning_rate": 0.00018775031946261064,
199
+ "loss": 0.5890274524688721,
200
+ "step": 520
201
+ },
202
+ {
203
+ "epoch": 0.1921537229783827,
204
+ "grad_norm": 0.8379756212234497,
205
+ "learning_rate": 0.00018662174157320512,
206
+ "loss": 0.5636214256286621,
207
+ "step": 540
208
+ },
209
+ {
210
+ "epoch": 0.19927052753313762,
211
+ "grad_norm": 0.9337961077690125,
212
+ "learning_rate": 0.0001854471469576289,
213
+ "loss": 0.6806112766265869,
214
+ "step": 560
215
+ },
216
+ {
217
+ "epoch": 0.20638733208789253,
218
+ "grad_norm": 0.7828953266143799,
219
+ "learning_rate": 0.00018422715960470738,
220
+ "loss": 0.5343317985534668,
221
+ "step": 580
222
+ },
223
+ {
224
+ "epoch": 0.21350413664264745,
225
+ "grad_norm": 0.5975359678268433,
226
+ "learning_rate": 0.00018296242761759498,
227
+ "loss": 0.6245263099670411,
228
+ "step": 600
229
+ },
230
+ {
231
+ "epoch": 0.22062094119740236,
232
+ "grad_norm": 0.6228912472724915,
233
+ "learning_rate": 0.00018165362286947815,
234
+ "loss": 0.661290979385376,
235
+ "step": 620
236
+ },
237
+ {
238
+ "epoch": 0.22773774575215727,
239
+ "grad_norm": 0.9198949933052063,
240
+ "learning_rate": 0.00018030144064665125,
241
+ "loss": 0.6715561866760253,
242
+ "step": 640
243
+ },
244
+ {
245
+ "epoch": 0.23485455030691219,
246
+ "grad_norm": 0.6074244976043701,
247
+ "learning_rate": 0.00017890659927915418,
248
+ "loss": 0.5627779006958008,
249
+ "step": 660
250
+ },
251
+ {
252
+ "epoch": 0.2419713548616671,
253
+ "grad_norm": 0.8562520146369934,
254
+ "learning_rate": 0.0001774698397591685,
255
+ "loss": 0.6850435256958007,
256
+ "step": 680
257
+ },
258
+ {
259
+ "epoch": 0.24908815941642204,
260
+ "grad_norm": 0.7890612483024597,
261
+ "learning_rate": 0.0001759919253473745,
262
+ "loss": 0.5971568584442138,
263
+ "step": 700
264
+ },
265
+ {
266
+ "epoch": 0.2562049639711769,
267
+ "grad_norm": 0.6242470741271973,
268
+ "learning_rate": 0.0001744736411674786,
269
+ "loss": 0.6526790618896484,
270
+ "step": 720
271
+ },
272
+ {
273
+ "epoch": 0.26332176852593187,
274
+ "grad_norm": 0.6237924098968506,
275
+ "learning_rate": 0.00017291579378912576,
276
+ "loss": 0.614622688293457,
277
+ "step": 740
278
+ },
279
+ {
280
+ "epoch": 0.27043857308068675,
281
+ "grad_norm": 1.250014066696167,
282
+ "learning_rate": 0.00017131921079941966,
283
+ "loss": 0.6886546611785889,
284
+ "step": 760
285
+ },
286
+ {
287
+ "epoch": 0.2775553776354417,
288
+ "grad_norm": 0.5347809791564941,
289
+ "learning_rate": 0.00016968474036327733,
290
+ "loss": 0.638498067855835,
291
+ "step": 780
292
+ },
293
+ {
294
+ "epoch": 0.2846721821901966,
295
+ "grad_norm": 0.7605909705162048,
296
+ "learning_rate": 0.0001680132507728518,
297
+ "loss": 0.6557466983795166,
298
+ "step": 800
299
+ },
300
+ {
301
+ "epoch": 0.2917889867449515,
302
+ "grad_norm": 0.647409975528717,
303
+ "learning_rate": 0.00016630562998626286,
304
+ "loss": 0.588128662109375,
305
+ "step": 820
306
+ },
307
+ {
308
+ "epoch": 0.29890579129970646,
309
+ "grad_norm": 0.5860188007354736,
310
+ "learning_rate": 0.00016456278515588024,
311
+ "loss": 0.6051214694976806,
312
+ "step": 840
313
+ },
314
+ {
315
+ "epoch": 0.30602259585446134,
316
+ "grad_norm": 0.875608503818512,
317
+ "learning_rate": 0.00016278564214641024,
318
+ "loss": 0.5963128089904786,
319
+ "step": 860
320
+ },
321
+ {
322
+ "epoch": 0.3131394004092163,
323
+ "grad_norm": 0.6618257164955139,
324
+ "learning_rate": 0.00016097514504304148,
325
+ "loss": 0.6630919933319092,
326
+ "step": 880
327
+ },
328
+ {
329
+ "epoch": 0.32025620496397117,
330
+ "grad_norm": 0.49601805210113525,
331
+ "learning_rate": 0.00015913225564991143,
332
+ "loss": 0.7227569103240967,
333
+ "step": 900
334
+ },
335
+ {
336
+ "epoch": 0.3273730095187261,
337
+ "grad_norm": 0.8401608467102051,
338
+ "learning_rate": 0.0001572579529791598,
339
+ "loss": 0.5685585975646973,
340
+ "step": 920
341
+ },
342
+ {
343
+ "epoch": 0.334489814073481,
344
+ "grad_norm": 0.5503961443901062,
345
+ "learning_rate": 0.00015535323273084062,
346
+ "loss": 0.6007286548614502,
347
+ "step": 940
348
+ },
349
+ {
350
+ "epoch": 0.34160661862823594,
351
+ "grad_norm": 0.8694468140602112,
352
+ "learning_rate": 0.0001534191067639688,
353
+ "loss": 0.5565601825714112,
354
+ "step": 960
355
+ },
356
+ {
357
+ "epoch": 0.3487234231829908,
358
+ "grad_norm": 0.5447700023651123,
359
+ "learning_rate": 0.00015145660255898262,
360
+ "loss": 0.5796232223510742,
361
+ "step": 980
362
+ },
363
+ {
364
+ "epoch": 0.35584022773774576,
365
+ "grad_norm": 0.7616499662399292,
366
+ "learning_rate": 0.00014946676267190752,
367
+ "loss": 0.5668922424316406,
368
+ "step": 1000
369
+ },
370
+ {
371
+ "epoch": 0.35584022773774576,
372
+ "eval_loss": 0.6924759149551392,
373
+ "eval_runtime": 61.6748,
374
+ "eval_samples_per_second": 7.345,
375
+ "eval_steps_per_second": 0.924,
376
+ "step": 1000
377
+ },
378
+ {
379
+ "epoch": 0.36295703229250065,
380
+ "grad_norm": 0.8111125230789185,
381
+ "learning_rate": 0.00014745064418051108,
382
+ "loss": 0.6030837535858155,
383
+ "step": 1020
384
+ },
385
+ {
386
+ "epoch": 0.3700738368472556,
387
+ "grad_norm": 0.5550377368927002,
388
+ "learning_rate": 0.00014540931812274358,
389
+ "loss": 0.6802701473236084,
390
+ "step": 1040
391
+ },
392
+ {
393
+ "epoch": 0.3771906414020105,
394
+ "grad_norm": 0.7379717230796814,
395
+ "learning_rate": 0.00014334386892776247,
396
+ "loss": 0.5119946956634521,
397
+ "step": 1060
398
+ },
399
+ {
400
+ "epoch": 0.3843074459567654,
401
+ "grad_norm": 0.6235134601593018,
402
+ "learning_rate": 0.00014125539383984264,
403
+ "loss": 0.606415843963623,
404
+ "step": 1080
405
+ },
406
+ {
407
+ "epoch": 0.3914242505115203,
408
+ "grad_norm": 1.4088711738586426,
409
+ "learning_rate": 0.00013914500233547908,
410
+ "loss": 0.6457336902618408,
411
+ "step": 1100
412
+ },
413
+ {
414
+ "epoch": 0.39854105506627524,
415
+ "grad_norm": 0.6606688499450684,
416
+ "learning_rate": 0.00013701381553399145,
417
+ "loss": 0.6700205326080322,
418
+ "step": 1120
419
+ },
420
+ {
421
+ "epoch": 0.4056578596210302,
422
+ "grad_norm": 0.6286507248878479,
423
+ "learning_rate": 0.0001348629656019429,
424
+ "loss": 0.5820858001708984,
425
+ "step": 1140
426
+ },
427
+ {
428
+ "epoch": 0.41277466417578507,
429
+ "grad_norm": 0.5580460429191589,
430
+ "learning_rate": 0.00013269359515169114,
431
+ "loss": 0.5733586311340332,
432
+ "step": 1160
433
+ },
434
+ {
435
+ "epoch": 0.41989146873054,
436
+ "grad_norm": 0.7134071588516235,
437
+ "learning_rate": 0.0001305068566343893,
438
+ "loss": 0.6202582359313965,
439
+ "step": 1180
440
+ },
441
+ {
442
+ "epoch": 0.4270082732852949,
443
+ "grad_norm": 0.7348489761352539,
444
+ "learning_rate": 0.000128303911727761,
445
+ "loss": 0.6173674583435058,
446
+ "step": 1200
447
+ },
448
+ {
449
+ "epoch": 0.43412507784004983,
450
+ "grad_norm": 1.1008996963500977,
451
+ "learning_rate": 0.0001260859307189731,
452
+ "loss": 0.61796555519104,
453
+ "step": 1220
454
+ },
455
+ {
456
+ "epoch": 0.4412418823948047,
457
+ "grad_norm": 0.9191972613334656,
458
+ "learning_rate": 0.0001238540918829353,
459
+ "loss": 0.6021127223968505,
460
+ "step": 1240
461
+ },
462
+ {
463
+ "epoch": 0.44835868694955966,
464
+ "grad_norm": 0.6578056216239929,
465
+ "learning_rate": 0.00012160958085635628,
466
+ "loss": 0.6018884658813477,
467
+ "step": 1260
468
+ },
469
+ {
470
+ "epoch": 0.45547549150431454,
471
+ "grad_norm": 0.6848057508468628,
472
+ "learning_rate": 0.00011935359000788873,
473
+ "loss": 0.5744600772857666,
474
+ "step": 1280
475
+ },
476
+ {
477
+ "epoch": 0.4625922960590695,
478
+ "grad_norm": 0.6195323467254639,
479
+ "learning_rate": 0.0001170873178046985,
480
+ "loss": 0.5539962768554687,
481
+ "step": 1300
482
+ },
483
+ {
484
+ "epoch": 0.46970910061382437,
485
+ "grad_norm": 0.6317474842071533,
486
+ "learning_rate": 0.00011481196817579352,
487
+ "loss": 0.5842368602752686,
488
+ "step": 1320
489
+ },
490
+ {
491
+ "epoch": 0.4768259051685793,
492
+ "grad_norm": 0.6639163494110107,
493
+ "learning_rate": 0.00011252874987245164,
494
+ "loss": 0.5572715759277344,
495
+ "step": 1340
496
+ },
497
+ {
498
+ "epoch": 0.4839427097233342,
499
+ "grad_norm": 0.519478976726532,
500
+ "learning_rate": 0.00011023887582608646,
501
+ "loss": 0.5544273376464843,
502
+ "step": 1360
503
+ },
504
+ {
505
+ "epoch": 0.49105951427808914,
506
+ "grad_norm": 0.7147398591041565,
507
+ "learning_rate": 0.0001079435625038925,
508
+ "loss": 0.5225166797637939,
509
+ "step": 1380
510
+ },
511
+ {
512
+ "epoch": 0.4981763188328441,
513
+ "grad_norm": 0.7628998160362244,
514
+ "learning_rate": 0.00010564402926261217,
515
+ "loss": 0.5659779071807861,
516
+ "step": 1400
517
+ },
518
+ {
519
+ "epoch": 0.505293123387599,
520
+ "grad_norm": 0.8272300958633423,
521
+ "learning_rate": 0.00010334149770076747,
522
+ "loss": 0.6124475955963135,
523
+ "step": 1420
524
+ },
525
+ {
526
+ "epoch": 0.5124099279423538,
527
+ "grad_norm": 0.6211256980895996,
528
+ "learning_rate": 0.00010103719100970115,
529
+ "loss": 0.5526745796203614,
530
+ "step": 1440
531
+ },
532
+ {
533
+ "epoch": 0.5195267324971088,
534
+ "grad_norm": 0.8070167899131775,
535
+ "learning_rate": 9.873233332377124e-05,
536
+ "loss": 0.5943079948425293,
537
+ "step": 1460
538
+ },
539
+ {
540
+ "epoch": 0.5266435370518637,
541
+ "grad_norm": 0.5424654483795166,
542
+ "learning_rate": 9.642814907004504e-05,
543
+ "loss": 0.5948707580566406,
544
+ "step": 1480
545
+ },
546
+ {
547
+ "epoch": 0.5337603416066187,
548
+ "grad_norm": 0.7003446817398071,
549
+ "learning_rate": 9.41258623178373e-05,
550
+ "loss": 0.6317638397216797,
551
+ "step": 1500
552
+ },
553
+ {
554
+ "epoch": 0.5337603416066187,
555
+ "eval_loss": 0.6935540437698364,
556
+ "eval_runtime": 61.6485,
557
+ "eval_samples_per_second": 7.348,
558
+ "eval_steps_per_second": 0.925,
559
+ "step": 1500
560
+ },
561
+ {
562
+ "epoch": 0.5408771461613735,
563
+ "grad_norm": 0.6322863101959229,
564
+ "learning_rate": 9.182669612843861e-05,
565
+ "loss": 0.665757417678833,
566
+ "step": 1520
567
+ },
568
+ {
569
+ "epoch": 0.5479939507161284,
570
+ "grad_norm": 0.6435247659683228,
571
+ "learning_rate": 8.953187190537928e-05,
572
+ "loss": 0.4951322078704834,
573
+ "step": 1540
574
+ },
575
+ {
576
+ "epoch": 0.5551107552708834,
577
+ "grad_norm": 0.8010550141334534,
578
+ "learning_rate": 8.724260874557384e-05,
579
+ "loss": 0.6802570343017578,
580
+ "step": 1560
581
+ },
582
+ {
583
+ "epoch": 0.5622275598256383,
584
+ "grad_norm": 0.8338032960891724,
585
+ "learning_rate": 8.496012279169097e-05,
586
+ "loss": 0.5899542331695556,
587
+ "step": 1580
588
+ },
589
+ {
590
+ "epoch": 0.5693443643803932,
591
+ "grad_norm": 0.7749608159065247,
592
+ "learning_rate": 8.268562658609254e-05,
593
+ "loss": 0.6368399620056152,
594
+ "step": 1600
595
+ },
596
+ {
597
+ "epoch": 0.5764611689351481,
598
+ "grad_norm": 0.7659400105476379,
599
+ "learning_rate": 8.042032842668596e-05,
600
+ "loss": 0.5484944820404053,
601
+ "step": 1620
602
+ },
603
+ {
604
+ "epoch": 0.583577973489903,
605
+ "grad_norm": 1.1483023166656494,
606
+ "learning_rate": 7.816543172503053e-05,
607
+ "loss": 0.554067325592041,
608
+ "step": 1640
609
+ },
610
+ {
611
+ "epoch": 0.590694778044658,
612
+ "grad_norm": 0.6539034247398376,
613
+ "learning_rate": 7.592213436704003e-05,
614
+ "loss": 0.5624193668365478,
615
+ "step": 1660
616
+ },
617
+ {
618
+ "epoch": 0.5978115825994129,
619
+ "grad_norm": 0.6587591767311096,
620
+ "learning_rate": 7.369162807662087e-05,
621
+ "loss": 0.6478344917297363,
622
+ "step": 1680
623
+ },
624
+ {
625
+ "epoch": 0.6049283871541677,
626
+ "grad_norm": 0.5699108839035034,
627
+ "learning_rate": 7.147509778258334e-05,
628
+ "loss": 0.6776344299316406,
629
+ "step": 1700
630
+ },
631
+ {
632
+ "epoch": 0.6120451917089227,
633
+ "grad_norm": 0.6976863145828247,
634
+ "learning_rate": 6.927372098916294e-05,
635
+ "loss": 0.615149450302124,
636
+ "step": 1720
637
+ },
638
+ {
639
+ "epoch": 0.6191619962636776,
640
+ "grad_norm": 0.7388312220573425,
641
+ "learning_rate": 6.708866715048585e-05,
642
+ "loss": 0.6845808982849121,
643
+ "step": 1740
644
+ },
645
+ {
646
+ "epoch": 0.6262788008184326,
647
+ "grad_norm": 0.5257291793823242,
648
+ "learning_rate": 6.492109704931101e-05,
649
+ "loss": 0.6309232234954834,
650
+ "step": 1760
651
+ },
652
+ {
653
+ "epoch": 0.6333956053731874,
654
+ "grad_norm": 0.7461546063423157,
655
+ "learning_rate": 6.277216218037849e-05,
656
+ "loss": 0.6485635757446289,
657
+ "step": 1780
658
+ },
659
+ {
660
+ "epoch": 0.6405124099279423,
661
+ "grad_norm": 0.7209606766700745,
662
+ "learning_rate": 6.0643004138692375e-05,
663
+ "loss": 0.6348707675933838,
664
+ "step": 1800
665
+ },
666
+ {
667
+ "epoch": 0.6476292144826973,
668
+ "grad_norm": 0.6697937250137329,
669
+ "learning_rate": 5.853475401306241e-05,
670
+ "loss": 0.5393397331237793,
671
+ "step": 1820
672
+ },
673
+ {
674
+ "epoch": 0.6547460190374522,
675
+ "grad_norm": 0.6282167434692383,
676
+ "learning_rate": 5.644853178522733e-05,
677
+ "loss": 0.7485171794891358,
678
+ "step": 1840
679
+ },
680
+ {
681
+ "epoch": 0.661862823592207,
682
+ "grad_norm": 0.6313266158103943,
683
+ "learning_rate": 5.438544573487811e-05,
684
+ "loss": 0.5625770092010498,
685
+ "step": 1860
686
+ },
687
+ {
688
+ "epoch": 0.668979628146962,
689
+ "grad_norm": 0.6948370933532715,
690
+ "learning_rate": 5.23465918508984e-05,
691
+ "loss": 0.6276655197143555,
692
+ "step": 1880
693
+ },
694
+ {
695
+ "epoch": 0.6760964327017169,
696
+ "grad_norm": 0.6330398321151733,
697
+ "learning_rate": 5.0333053249133924e-05,
698
+ "loss": 0.541553258895874,
699
+ "step": 1900
700
+ },
701
+ {
702
+ "epoch": 0.6832132372564719,
703
+ "grad_norm": 0.7645998001098633,
704
+ "learning_rate": 4.834589959700061e-05,
705
+ "loss": 0.5345258235931396,
706
+ "step": 1920
707
+ },
708
+ {
709
+ "epoch": 0.6903300418112268,
710
+ "grad_norm": 0.6248979568481445,
711
+ "learning_rate": 4.6386186545237054e-05,
712
+ "loss": 0.6255132675170898,
713
+ "step": 1940
714
+ },
715
+ {
716
+ "epoch": 0.6974468463659816,
717
+ "grad_norm": 0.8874566555023193,
718
+ "learning_rate": 4.445495516710312e-05,
719
+ "loss": 0.5986682891845703,
720
+ "step": 1960
721
+ },
722
+ {
723
+ "epoch": 0.7045636509207366,
724
+ "grad_norm": 0.7236739993095398,
725
+ "learning_rate": 4.2553231405322724e-05,
726
+ "loss": 0.6396020412445068,
727
+ "step": 1980
728
+ },
729
+ {
730
+ "epoch": 0.7116804554754915,
731
+ "grad_norm": 0.57773357629776,
732
+ "learning_rate": 4.0682025527064486e-05,
733
+ "loss": 0.5397399425506592,
734
+ "step": 2000
735
+ },
736
+ {
737
+ "epoch": 0.7116804554754915,
738
+ "eval_loss": 0.6902480721473694,
739
+ "eval_runtime": 61.8578,
740
+ "eval_samples_per_second": 7.323,
741
+ "eval_steps_per_second": 0.921,
742
+ "step": 2000
743
+ },
744
+ {
745
+ "epoch": 0.7187972600302465,
746
+ "grad_norm": 0.9456912875175476,
747
+ "learning_rate": 3.8842331587249756e-05,
748
+ "loss": 0.5803652286529541,
749
+ "step": 2020
750
+ },
751
+ {
752
+ "epoch": 0.7259140645850013,
753
+ "grad_norm": 0.6886982321739197,
754
+ "learning_rate": 3.703512690047336e-05,
755
+ "loss": 0.6748288154602051,
756
+ "step": 2040
757
+ },
758
+ {
759
+ "epoch": 0.7330308691397562,
760
+ "grad_norm": 0.6338731050491333,
761
+ "learning_rate": 3.5261371521817244e-05,
762
+ "loss": 0.47889223098754885,
763
+ "step": 2060
764
+ },
765
+ {
766
+ "epoch": 0.7401476736945112,
767
+ "grad_norm": 0.6906462907791138,
768
+ "learning_rate": 3.352200773683317e-05,
769
+ "loss": 0.6045923233032227,
770
+ "step": 2080
771
+ },
772
+ {
773
+ "epoch": 0.7472644782492661,
774
+ "grad_norm": 0.8363668322563171,
775
+ "learning_rate": 3.1817959560965215e-05,
776
+ "loss": 0.5399829387664795,
777
+ "step": 2100
778
+ },
779
+ {
780
+ "epoch": 0.754381282804021,
781
+ "grad_norm": 0.5785157680511475,
782
+ "learning_rate": 3.0150132248677976e-05,
783
+ "loss": 0.64073805809021,
784
+ "step": 2120
785
+ },
786
+ {
787
+ "epoch": 0.7614980873587759,
788
+ "grad_norm": 0.7285847663879395,
789
+ "learning_rate": 2.8519411812551388e-05,
790
+ "loss": 0.6770923137664795,
791
+ "step": 2140
792
+ },
793
+ {
794
+ "epoch": 0.7686148919135308,
795
+ "grad_norm": 0.5978537201881409,
796
+ "learning_rate": 2.6926664552597535e-05,
797
+ "loss": 0.6357324123382568,
798
+ "step": 2160
799
+ },
800
+ {
801
+ "epoch": 0.7757316964682858,
802
+ "grad_norm": 0.5783416032791138,
803
+ "learning_rate": 2.5372736596049418e-05,
804
+ "loss": 0.5521987438201904,
805
+ "step": 2180
806
+ },
807
+ {
808
+ "epoch": 0.7828485010230406,
809
+ "grad_norm": 0.6857144236564636,
810
+ "learning_rate": 2.3858453447866557e-05,
811
+ "loss": 0.6403320789337158,
812
+ "step": 2200
813
+ },
814
+ {
815
+ "epoch": 0.7899653055777955,
816
+ "grad_norm": 0.6665685772895813,
817
+ "learning_rate": 2.2384619552195518e-05,
818
+ "loss": 0.5500887393951416,
819
+ "step": 2220
820
+ },
821
+ {
822
+ "epoch": 0.7970821101325505,
823
+ "grad_norm": 0.6291202306747437,
824
+ "learning_rate": 2.0952017865019036e-05,
825
+ "loss": 0.5161121368408204,
826
+ "step": 2240
827
+ },
828
+ {
829
+ "epoch": 0.8041989146873054,
830
+ "grad_norm": 0.487289696931839,
831
+ "learning_rate": 1.9561409438220247e-05,
832
+ "loss": 0.547119140625,
833
+ "step": 2260
834
+ },
835
+ {
836
+ "epoch": 0.8113157192420604,
837
+ "grad_norm": 0.6173009872436523,
838
+ "learning_rate": 1.8213533015283525e-05,
839
+ "loss": 0.6167063236236572,
840
+ "step": 2280
841
+ },
842
+ {
843
+ "epoch": 0.8184325237968152,
844
+ "grad_norm": 0.7068958282470703,
845
+ "learning_rate": 1.6909104638845986e-05,
846
+ "loss": 0.5167754173278809,
847
+ "step": 2300
848
+ },
849
+ {
850
+ "epoch": 0.8255493283515701,
851
+ "grad_norm": 0.8526712656021118,
852
+ "learning_rate": 1.5648817270308645e-05,
853
+ "loss": 0.5217617511749267,
854
+ "step": 2320
855
+ },
856
+ {
857
+ "epoch": 0.8326661329063251,
858
+ "grad_norm": 0.7311387658119202,
859
+ "learning_rate": 1.4433340421709595e-05,
860
+ "loss": 0.6144563674926757,
861
+ "step": 2340
862
+ },
863
+ {
864
+ "epoch": 0.83978293746108,
865
+ "grad_norm": 0.5354213118553162,
866
+ "learning_rate": 1.3263319800053697e-05,
867
+ "loss": 0.6949819087982178,
868
+ "step": 2360
869
+ },
870
+ {
871
+ "epoch": 0.8468997420158348,
872
+ "grad_norm": 0.7716789841651917,
873
+ "learning_rate": 1.2139376964288852e-05,
874
+ "loss": 0.5799826145172119,
875
+ "step": 2380
876
+ },
877
+ {
878
+ "epoch": 0.8540165465705898,
879
+ "grad_norm": 0.7061272263526917,
880
+ "learning_rate": 1.1062108995110565e-05,
881
+ "loss": 0.5493000030517579,
882
+ "step": 2400
883
+ },
884
+ {
885
+ "epoch": 0.8611333511253447,
886
+ "grad_norm": 0.49559611082077026,
887
+ "learning_rate": 1.003208817777025e-05,
888
+ "loss": 0.5216735363006592,
889
+ "step": 2420
890
+ },
891
+ {
892
+ "epoch": 0.8682501556800997,
893
+ "grad_norm": 0.737585186958313,
894
+ "learning_rate": 9.049861698055694e-06,
895
+ "loss": 0.5979462623596191,
896
+ "step": 2440
897
+ },
898
+ {
899
+ "epoch": 0.8753669602348545,
900
+ "grad_norm": 0.745577871799469,
901
+ "learning_rate": 8.115951351605378e-06,
902
+ "loss": 0.5613039016723633,
903
+ "step": 2460
904
+ },
905
+ {
906
+ "epoch": 0.8824837647896094,
907
+ "grad_norm": 0.686896026134491,
908
+ "learning_rate": 7.230853266711124e-06,
909
+ "loss": 0.7399949073791504,
910
+ "step": 2480
911
+ },
912
+ {
913
+ "epoch": 0.8896005693443644,
914
+ "grad_norm": 0.47512194514274597,
915
+ "learning_rate": 6.395037640756074e-06,
916
+ "loss": 0.6489872455596923,
917
+ "step": 2500
918
+ },
919
+ {
920
+ "epoch": 0.8896005693443644,
921
+ "eval_loss": 0.6895355582237244,
922
+ "eval_runtime": 61.7594,
923
+ "eval_samples_per_second": 7.335,
924
+ "eval_steps_per_second": 0.923,
925
+ "step": 2500
926
+ },
927
+ {
928
+ "epoch": 0.8967173738991193,
929
+ "grad_norm": 0.720199704170227,
930
+ "learning_rate": 5.608948490428023e-06,
931
+ "loss": 0.5943216800689697,
932
+ "step": 2520
933
+ },
934
+ {
935
+ "epoch": 0.9038341784538743,
936
+ "grad_norm": 0.5467908382415771,
937
+ "learning_rate": 4.8730034158412155e-06,
938
+ "loss": 0.6081305980682373,
939
+ "step": 2540
940
+ },
941
+ {
942
+ "epoch": 0.9109509830086291,
943
+ "grad_norm": 0.6065810918807983,
944
+ "learning_rate": 4.1875933786914345e-06,
945
+ "loss": 0.5498838424682617,
946
+ "step": 2560
947
+ },
948
+ {
949
+ "epoch": 0.918067787563384,
950
+ "grad_norm": 0.678969144821167,
951
+ "learning_rate": 3.5530824945623542e-06,
952
+ "loss": 0.5988584518432617,
953
+ "step": 2580
954
+ },
955
+ {
956
+ "epoch": 0.925184592118139,
957
+ "grad_norm": 0.5680024027824402,
958
+ "learning_rate": 2.969807839493732e-06,
959
+ "loss": 0.5618189811706543,
960
+ "step": 2600
961
+ },
962
+ {
963
+ "epoch": 0.9323013966728939,
964
+ "grad_norm": 0.5868266224861145,
965
+ "learning_rate": 2.438079270913951e-06,
966
+ "loss": 0.47838735580444336,
967
+ "step": 2620
968
+ },
969
+ {
970
+ "epoch": 0.9394182012276487,
971
+ "grad_norm": 0.5550084710121155,
972
+ "learning_rate": 1.9581792630320784e-06,
973
+ "loss": 0.5886428833007813,
974
+ "step": 2640
975
+ },
976
+ {
977
+ "epoch": 0.9465350057824037,
978
+ "grad_norm": 0.7638676762580872,
979
+ "learning_rate": 1.5303627567769041e-06,
980
+ "loss": 0.7057912826538086,
981
+ "step": 2660
982
+ },
983
+ {
984
+ "epoch": 0.9536518103371586,
985
+ "grad_norm": 2.5887210369110107,
986
+ "learning_rate": 1.1548570243627987e-06,
987
+ "loss": 0.5450192451477051,
988
+ "step": 2680
989
+ },
990
+ {
991
+ "epoch": 0.9607686148919136,
992
+ "grad_norm": 0.606455385684967,
993
+ "learning_rate": 8.31861548554147e-07,
994
+ "loss": 0.5962701797485351,
995
+ "step": 2700
996
+ },
997
+ {
998
+ "epoch": 0.9678854194466684,
999
+ "grad_norm": 0.8856239318847656,
1000
+ "learning_rate": 5.61547916692573e-07,
1001
+ "loss": 0.5428257942199707,
1002
+ "step": 2720
1003
+ },
1004
+ {
1005
+ "epoch": 0.9750022240014233,
1006
+ "grad_norm": 0.5785204172134399,
1007
+ "learning_rate": 3.440597295433445e-07,
1008
+ "loss": 0.647684907913208,
1009
+ "step": 2740
1010
+ },
1011
+ {
1012
+ "epoch": 0.9821190285561783,
1013
+ "grad_norm": 0.6091363430023193,
1014
+ "learning_rate": 1.795125250092067e-07,
1015
+ "loss": 0.6739026069641113,
1016
+ "step": 2760
1017
+ },
1018
+ {
1019
+ "epoch": 0.9892358331109332,
1020
+ "grad_norm": 0.6739264726638794,
1021
+ "learning_rate": 6.799371675230148e-08,
1022
+ "loss": 0.5888994216918946,
1023
+ "step": 2780
1024
+ },
1025
+ {
1026
+ "epoch": 0.9963526376656882,
1027
+ "grad_norm": 0.7204112410545349,
1028
+ "learning_rate": 9.56254775678067e-09,
1029
+ "loss": 0.6584902763366699,
1030
+ "step": 2800
1031
+ }
1032
+ ],
1033
+ "logging_steps": 20,
1034
+ "max_steps": 2811,
1035
+ "num_input_tokens_seen": 0,
1036
+ "num_train_epochs": 1,
1037
+ "save_steps": 500,
1038
+ "stateful_callbacks": {
1039
+ "TrainerControl": {
1040
+ "args": {
1041
+ "should_epoch_stop": false,
1042
+ "should_evaluate": false,
1043
+ "should_log": false,
1044
+ "should_save": true,
1045
+ "should_training_stop": true
1046
+ },
1047
+ "attributes": {}
1048
+ }
1049
+ },
1050
+ "total_flos": 1.802380477631742e+18,
1051
+ "train_batch_size": 1,
1052
+ "trial_name": null,
1053
+ "trial_params": null
1054
+ }
checkpoint-2811/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c08bb00e1e243b1f9e9b6645a8d5bd488ad57e7beb1e85e7b7cf4fda61f49e32
3
+ size 5137
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 32768,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c08bb00e1e243b1f9e9b6645a8d5bd488ad57e7beb1e85e7b7cf4fda61f49e32
3
+ size 5137