beezza committed on
Commit
b8713b7
·
verified ·
1 Parent(s): 00fae49

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/gemma-4-e2b-it-unsloth-bnb-4bit
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/gemma-4-e2b-it-unsloth-bnb-4bit
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** LoRA adapter (PEFT; rank 64, alpha 128) for causal language modeling
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** unsloth/gemma-4-e2b-it-unsloth-bnb-4bit
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.1
adapter_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Gemma4ForConditionalGeneration",
7
+ "parent_library": "transformers.models.gemma4.modeling_gemma4",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/gemma-4-e2b-it-unsloth-bnb-4bit",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 128,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.0,
26
+ "megatron_config": null,
27
+ "megatron_core": "megatron.core",
28
+ "modules_to_save": null,
29
+ "peft_type": "LORA",
30
+ "peft_version": "0.18.1",
31
+ "qalora_group_size": 16,
32
+ "r": 64,
33
+ "rank_pattern": {},
34
+ "revision": null,
35
+ "target_modules": [
36
+ "up_proj",
37
+ "v_proj",
38
+ "q_proj",
39
+ "k_proj",
40
+ "o_proj",
41
+ "gate_proj",
42
+ "down_proj"
43
+ ],
44
+ "target_parameters": null,
45
+ "task_type": "CAUSAL_LM",
46
+ "trainable_token_indices": null,
47
+ "use_dora": false,
48
+ "use_qalora": false,
49
+ "use_rslora": false
50
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba4bd0236cb0ec67d26d56301ddf84aef566b75ddf4f2503451d5aaff0921d55
3
+ size 496752408
chat_template.jinja ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro format_parameters(properties, required, filter_keys=false) -%}
2
+ {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
+ {%- set ns = namespace(found_first=false) -%}
4
+ {%- for key, value in properties | dictsort -%}
5
+ {%- set add_comma = false -%}
6
+ {%- if not filter_keys or key not in standard_keys -%}
7
+ {%- if ns.found_first %},{% endif -%}
8
+ {%- set ns.found_first = true -%}
9
+ {{ key }}:{
10
+ {%- if value['description'] -%}
11
+ description:<|"|>{{ value['description'] }}<|"|>
12
+ {%- set add_comma = true -%}
13
+ {%- endif -%}
14
+ {%- if value['type'] | upper == 'STRING' -%}
15
+ {%- if value['enum'] -%}
16
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
17
+ enum:{{ format_argument(value['enum']) }}
18
+ {%- endif -%}
19
+ {%- elif value['type'] | upper == 'ARRAY' -%}
20
+ {%- if value['items'] is mapping and value['items'] -%}
21
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
22
+ items:{
23
+ {%- set ns_items = namespace(found_first=false) -%}
24
+ {%- for item_key, item_value in value['items'] | dictsort -%}
25
+ {%- if item_value is not none -%}
26
+ {%- if ns_items.found_first %},{% endif -%}
27
+ {%- set ns_items.found_first = true -%}
28
+ {%- if item_key == 'properties' -%}
29
+ properties:{
30
+ {%- if item_value is mapping -%}
31
+ {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
32
+ {%- endif -%}
33
+ }
34
+ {%- elif item_key == 'required' -%}
35
+ required:[
36
+ {%- for req_item in item_value -%}
37
+ <|"|>{{- req_item -}}<|"|>
38
+ {%- if not loop.last %},{% endif -%}
39
+ {%- endfor -%}
40
+ ]
41
+ {%- elif item_key == 'type' -%}
42
+ {%- if item_value is string -%}
43
+ type:{{ format_argument(item_value | upper) }}
44
+ {%- else -%}
45
+ type:{{ format_argument(item_value | map('upper') | list) }}
46
+ {%- endif -%}
47
+ {%- else -%}
48
+ {{ item_key }}:{{ format_argument(item_value) }}
49
+ {%- endif -%}
50
+ {%- endif -%}
51
+ {%- endfor -%}
52
+ }
53
+ {%- endif -%}
54
+ {%- endif -%}
55
+ {%- if value['nullable'] %}
56
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
57
+ nullable:true
58
+ {%- endif -%}
59
+ {%- if value['type'] | upper == 'OBJECT' -%}
60
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
61
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
62
+ properties:{
63
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
64
+ }
65
+ {%- elif value is mapping -%}
66
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
67
+ properties:{
68
+ {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
69
+ }
70
+ {%- endif -%}
71
+ {%- if value['required'] -%}
72
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
73
+ required:[
74
+ {%- for item in value['required'] | default([]) -%}
75
+ <|"|>{{- item -}}<|"|>
76
+ {%- if not loop.last %},{% endif -%}
77
+ {%- endfor -%}
78
+ ]
79
+ {%- endif -%}
80
+ {%- endif -%}
81
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
82
+ type:<|"|>{{ value['type'] | upper }}<|"|>}
83
+ {%- endif -%}
84
+ {%- endfor -%}
85
+ {%- endmacro -%}
86
+ {%- macro format_function_declaration(tool_data) -%}
87
+ declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
88
+ {%- set params = tool_data['function']['parameters'] -%}
89
+ {%- if params -%}
90
+ ,parameters:{
91
+ {%- if params['properties'] -%}
92
+ properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
93
+ {%- endif -%}
94
+ {%- if params['required'] -%}
95
+ required:[
96
+ {%- for item in params['required'] -%}
97
+ <|"|>{{- item -}}<|"|>
98
+ {{- ',' if not loop.last -}}
99
+ {%- endfor -%}
100
+ ],
101
+ {%- endif -%}
102
+ {%- if params['type'] -%}
103
+ type:<|"|>{{- params['type'] | upper -}}<|"|>}
104
+ {%- endif -%}
105
+ {%- endif -%}
106
+ {%- if 'response' in tool_data['function'] -%}
107
+ {%- set response_declaration = tool_data['function']['response'] -%}
108
+ ,response:{
109
+ {%- if response_declaration['description'] -%}
110
+ description:<|"|>{{- response_declaration['description'] -}}<|"|>,
111
+ {%- endif -%}
112
+ {%- if response_declaration['type'] | upper == 'OBJECT' -%}
113
+ type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
114
+ {%- endif -%}
115
+ {%- endif -%}
116
+ }
117
+ {%- endmacro -%}
118
+ {%- macro format_argument(argument, escape_keys=True) -%}
119
+ {%- if argument is string -%}
120
+ {{- '<|"|>' + argument + '<|"|>' -}}
121
+ {%- elif argument is boolean -%}
122
+ {{- 'true' if argument else 'false' -}}
123
+ {%- elif argument is mapping -%}
124
+ {{- '{' -}}
125
+ {%- set ns = namespace(found_first=false) -%}
126
+ {%- for key, value in argument | dictsort -%}
127
+ {%- if ns.found_first %},{% endif -%}
128
+ {%- set ns.found_first = true -%}
129
+ {%- if escape_keys -%}
130
+ {{- '<|"|>' + key + '<|"|>' -}}
131
+ {%- else -%}
132
+ {{- key -}}
133
+ {%- endif -%}
134
+ :{{- format_argument(value, escape_keys=escape_keys) -}}
135
+ {%- endfor -%}
136
+ {{- '}' -}}
137
+ {%- elif argument is sequence -%}
138
+ {{- '[' -}}
139
+ {%- for item in argument -%}
140
+ {{- format_argument(item, escape_keys=escape_keys) -}}
141
+ {%- if not loop.last %},{% endif -%}
142
+ {%- endfor -%}
143
+ {{- ']' -}}
144
+ {%- else -%}
145
+ {{- argument -}}
146
+ {%- endif -%}
147
+ {%- endmacro -%}
148
+ {%- macro strip_thinking(text) -%}
149
+ {%- set ns = namespace(result='') -%}
150
+ {%- for part in text.split('<channel|>') -%}
151
+ {%- if '<|channel>' in part -%}
152
+ {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
153
+ {%- else -%}
154
+ {%- set ns.result = ns.result + part -%}
155
+ {%- endif -%}
156
+ {%- endfor -%}
157
+ {{- ns.result | trim -}}
158
+ {%- endmacro -%}
159
+
160
+ {%- macro format_tool_response_block(tool_name, response) -%}
161
+ {{- '<|tool_response>' -}}
162
+ {%- if response is mapping -%}
163
+ {{- 'response:' + tool_name + '{' -}}
164
+ {%- for key, value in response | dictsort -%}
165
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
166
+ {%- if not loop.last %},{% endif -%}
167
+ {%- endfor -%}
168
+ {{- '}' -}}
169
+ {%- else -%}
170
+ {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
171
+ {%- endif -%}
172
+ {{- '<tool_response|>' -}}
173
+ {%- endmacro -%}
174
+
175
+ {%- set ns = namespace(prev_message_type=None) -%}
176
+ {%- set loop_messages = messages -%}
177
+ {{- bos_token -}}
178
+ {#- Handle System/Tool Definitions Block -#}
179
+ {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
180
+ {{- '<|turn>system\n' -}}
181
+ {#- Inject Thinking token at the very top of the FIRST system turn -#}
182
+ {%- if enable_thinking is defined and enable_thinking -%}
183
+ {{- '<|think|>\n' -}}
184
+ {%- set ns.prev_message_type = 'think' -%}
185
+ {%- endif -%}
186
+ {%- if messages[0]['role'] in ['system', 'developer'] -%}
187
+ {%- if messages[0]['content'] is string -%}
188
+ {{- messages[0]['content'] | trim -}}
189
+ {%- elif messages[0]['content'] is sequence -%}
190
+ {%- for item in messages[0]['content'] -%}
191
+ {{- item['text'] | trim + ' '-}}
192
+ {%- endfor -%}
193
+ {%- endif -%}
194
+ {%- set loop_messages = messages[1:] -%}
195
+ {%- endif -%}
196
+ {%- if tools -%}
197
+ {%- for tool in tools %}
198
+ {{- '<|tool>' -}}
199
+ {{- format_function_declaration(tool) | trim -}}
200
+ {{- '<tool|>' -}}
201
+ {%- endfor %}
202
+ {%- set ns.prev_message_type = 'tool' -%}
203
+ {%- endif -%}
204
+ {{- '<turn|>\n' -}}
205
+ {%- endif %}
206
+
207
+ {#- Pre-scan: find last user message index for reasoning guard -#}
208
+ {%- set ns_turn = namespace(last_user_idx=-1) -%}
209
+ {%- for i in range(loop_messages | length) -%}
210
+ {%- if loop_messages[i]['role'] == 'user' -%}
211
+ {%- set ns_turn.last_user_idx = i -%}
212
+ {%- endif -%}
213
+ {%- endfor -%}
214
+
215
+ {#- Loop through messages -#}
216
+ {%- for message in loop_messages -%}
217
+ {%- if message['role'] != 'tool' -%}
218
+ {%- set ns.prev_message_type = None -%}
219
+ {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
220
+ {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
221
+ {%- set prev_nt = namespace(role=None, found=false) -%}
222
+ {%- if loop.index0 > 0 -%}
223
+ {%- for j in range(loop.index0 - 1, -1, -1) -%}
224
+ {%- if not prev_nt.found -%}
225
+ {%- if loop_messages[j]['role'] != 'tool' -%}
226
+ {%- set prev_nt.role = loop_messages[j]['role'] -%}
227
+ {%- set prev_nt.found = true -%}
228
+ {%- endif -%}
229
+ {%- endif -%}
230
+ {%- endfor -%}
231
+ {%- endif -%}
232
+ {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
233
+ {%- if not continue_same_model_turn -%}
234
+ {{- '<|turn>' + role + '\n' }}
235
+ {%- endif -%}
236
+
237
+ {#- Render reasoning/reasoning_content as thinking channel -#}
238
+ {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
239
+ {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
240
+ {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
241
+ {%- endif -%}
242
+
243
+ {%- if message['tool_calls'] -%}
244
+ {%- for tool_call in message['tool_calls'] -%}
245
+ {%- set function = tool_call['function'] -%}
246
+ {{- '<|tool_call>call:' + function['name'] + '{' -}}
247
+ {%- if function['arguments'] is mapping -%}
248
+ {%- set ns_args = namespace(found_first=false) -%}
249
+ {%- for key, value in function['arguments'] | dictsort -%}
250
+ {%- if ns_args.found_first %},{% endif -%}
251
+ {%- set ns_args.found_first = true -%}
252
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
253
+ {%- endfor -%}
254
+ {%- elif function['arguments'] is string -%}
255
+ {{- function['arguments'] -}}
256
+ {%- endif -%}
257
+ {{- '}<tool_call|>' -}}
258
+ {%- endfor -%}
259
+ {%- set ns.prev_message_type = 'tool_call' -%}
260
+ {%- endif -%}
261
+
262
+ {%- set ns_tr_out = namespace(flag=false) -%}
263
+ {%- if message.get('tool_responses') -%}
264
+ {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
265
+ {%- for tool_response in message['tool_responses'] -%}
266
+ {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
267
+ {%- set ns_tr_out.flag = true -%}
268
+ {%- set ns.prev_message_type = 'tool_response' -%}
269
+ {%- endfor -%}
270
+ {%- elif message.get('tool_calls') -%}
271
+ {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
272
+ {%- set ns_tool_scan = namespace(stopped=false) -%}
273
+ {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
274
+ {%- if ns_tool_scan.stopped -%}
275
+ {%- elif loop_messages[k]['role'] != 'tool' -%}
276
+ {%- set ns_tool_scan.stopped = true -%}
277
+ {%- else -%}
278
+ {%- set follow = loop_messages[k] -%}
279
+ {#- Resolve tool_call_id to function name -#}
280
+ {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
281
+ {%- for tc in message['tool_calls'] -%}
282
+ {%- if tc.get('id') == follow.get('tool_call_id') -%}
283
+ {%- set ns_tname.name = tc['function']['name'] -%}
284
+ {%- endif -%}
285
+ {%- endfor -%}
286
+ {#- Handle content as string or content-parts array -#}
287
+ {%- set tool_body = follow.get('content') -%}
288
+ {%- if tool_body is string -%}
289
+ {{- format_tool_response_block(ns_tname.name, tool_body) -}}
290
+ {%- elif tool_body is sequence and tool_body is not string -%}
291
+ {%- set ns_txt = namespace(s='') -%}
292
+ {%- for part in tool_body -%}
293
+ {%- if part.get('type') == 'text' -%}
294
+ {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
295
+ {%- endif -%}
296
+ {%- endfor -%}
297
+ {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
298
+ {%- else -%}
299
+ {{- format_tool_response_block(ns_tname.name, tool_body) -}}
300
+ {%- endif -%}
301
+ {%- set ns_tr_out.flag = true -%}
302
+ {%- set ns.prev_message_type = 'tool_response' -%}
303
+ {%- endif -%}
304
+ {%- endfor -%}
305
+ {%- endif -%}
306
+
307
+ {%- set captured_content -%}
308
+ {%- if message['content'] is string -%}
309
+ {%- if role == 'model' -%}
310
+ {{- strip_thinking(message['content']) -}}
311
+ {%- else -%}
312
+ {{- message['content'] | trim -}}
313
+ {%- endif -%}
314
+ {%- elif message['content'] is sequence -%}
315
+ {%- for item in message['content'] -%}
316
+ {%- if item['type'] == 'text' -%}
317
+ {%- if role == 'model' -%}
318
+ {{- strip_thinking(item['text']) -}}
319
+ {%- else -%}
320
+ {{- item['text'] | trim -}}
321
+ {%- endif -%}
322
+ {%- elif item['type'] == 'image' -%}
323
+ {{- '<|image|>' -}}
324
+ {%- set ns.prev_message_type = 'image' -%}
325
+ {%- elif item['type'] == 'audio' -%}
326
+ {{- '<|audio|>' -}}
327
+ {%- set ns.prev_message_type = 'audio' -%}
328
+ {%- elif item['type'] == 'video' -%}
329
+ {{- '<|video|>' -}}
330
+ {%- set ns.prev_message_type = 'video' -%}
331
+ {%- endif -%}
332
+ {%- endfor -%}
333
+ {%- endif -%}
334
+ {%- endset -%}
335
+
336
+ {{- captured_content -}}
337
+ {%- set has_content = captured_content | trim | length > 0 -%}
338
+
339
+ {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
340
+ {{- '<|tool_response>' -}}
341
+ {%- elif not (ns_tr_out.flag and not has_content) -%}
342
+ {{- '<turn|>\n' -}}
343
+ {%- endif -%}
344
+ {%- endif -%}
345
+ {%- endfor -%}
346
+
347
+ {%- if add_generation_prompt -%}
348
+ {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
349
+ {{- '<|turn>model\n' -}}
350
+ {%- endif -%}
351
+ {%- endif -%}
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff61320420b47c1e1b4581b4c53c3bd7326f48fca596c78a632bf5c087631369
3
+ size 206452109
processor_config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_ms_per_token": 40,
3
+ "audio_seq_length": 750,
4
+ "feature_extractor": {
5
+ "dither": 0.0,
6
+ "feature_extractor_type": "Gemma4AudioFeatureExtractor",
7
+ "feature_size": 128,
8
+ "fft_length": 512,
9
+ "fft_overdrive": false,
10
+ "frame_length": 320,
11
+ "hop_length": 160,
12
+ "input_scale_factor": 1.0,
13
+ "max_frequency": 8000.0,
14
+ "mel_floor": 0.001,
15
+ "min_frequency": 0.0,
16
+ "padding_side": "left",
17
+ "padding_value": 0.0,
18
+ "per_bin_mean": null,
19
+ "per_bin_stddev": null,
20
+ "preemphasis": 0.0,
21
+ "preemphasis_htk_flavor": true,
22
+ "return_attention_mask": true,
23
+ "sampling_rate": 16000
24
+ },
25
+ "image_processor": {
26
+ "do_convert_rgb": true,
27
+ "do_normalize": false,
28
+ "do_rescale": true,
29
+ "do_resize": true,
30
+ "image_mean": [
31
+ 0.0,
32
+ 0.0,
33
+ 0.0
34
+ ],
35
+ "image_processor_type": "Gemma4ImageProcessor",
36
+ "image_seq_length": 280,
37
+ "image_std": [
38
+ 1.0,
39
+ 1.0,
40
+ 1.0
41
+ ],
42
+ "max_soft_tokens": 280,
43
+ "patch_size": 16,
44
+ "pooling_kernel_size": 3,
45
+ "resample": 3,
46
+ "rescale_factor": 0.00392156862745098
47
+ },
48
+ "image_seq_length": 280,
49
+ "processor_class": "Gemma4Processor",
50
+ "video_processor": {
51
+ "do_convert_rgb": true,
52
+ "do_normalize": true,
53
+ "do_rescale": true,
54
+ "do_resize": true,
55
+ "do_sample_frames": true,
56
+ "image_mean": [
57
+ 0.0,
58
+ 0.0,
59
+ 0.0
60
+ ],
61
+ "image_std": [
62
+ 1.0,
63
+ 1.0,
64
+ 1.0
65
+ ],
66
+ "max_soft_tokens": 70,
67
+ "num_frames": 32,
68
+ "patch_size": 16,
69
+ "pooling_kernel_size": 3,
70
+ "resample": 3,
71
+ "rescale_factor": 0.00392156862745098,
72
+ "return_metadata": false,
73
+ "video_processor_type": "Gemma4VideoProcessor"
74
+ }
75
+ }
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19b15d59b7d2f507771b26ca44b1603b8cdd22ee0992f3217d7e17b763986855
3
+ size 14645
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55d5c1b55b1dc021b0d083fa48f376d4fac1639b18f7a58768d5d035271b9272
3
+ size 1383
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e04623d938a55a7867a6a097661fe33f50bb72d27aeb99e8d2b5b4a017d6cc3
3
+ size 1465
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
3
+ size 32169626
tokenizer_config.json ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 131072,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "right",
45
+ "processor_class": "Gemma4Processor",
46
+ "response_schema": {
47
+ "properties": {
48
+ "content": {
49
+ "type": "string"
50
+ },
51
+ "role": {
52
+ "const": "assistant"
53
+ },
54
+ "thinking": {
55
+ "type": "string"
56
+ },
57
+ "tool_calls": {
58
+ "items": {
59
+ "properties": {
60
+ "function": {
61
+ "properties": {
62
+ "arguments": {
63
+ "additionalProperties": {},
64
+ "type": "object",
65
+ "x-parser": "gemma4-tool-call"
66
+ },
67
+ "name": {
68
+ "type": "string"
69
+ }
70
+ },
71
+ "type": "object",
72
+ "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
73
+ },
74
+ "type": {
75
+ "const": "function"
76
+ }
77
+ },
78
+ "type": "object"
79
+ },
80
+ "type": "array",
81
+ "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
82
+ }
83
+ },
84
+ "type": "object",
85
+ "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
86
+ },
87
+ "soc_token": "<|channel>",
88
+ "sot_token": "<|turn>",
89
+ "stc_token": "<|tool_call>",
90
+ "std_token": "<|tool>",
91
+ "str_token": "<|tool_response>",
92
+ "think_token": "<|think|>",
93
+ "tokenizer_class": "GemmaTokenizer",
94
+ "unk_token": "<unk>",
95
+ "added_tokens_decoder": {
96
+ "0": {
97
+ "content": "<pad>",
98
+ "single_word": false,
99
+ "lstrip": false,
100
+ "rstrip": false,
101
+ "normalized": false,
102
+ "special": true
103
+ },
104
+ "1": {
105
+ "content": "<eos>",
106
+ "single_word": false,
107
+ "lstrip": false,
108
+ "rstrip": false,
109
+ "normalized": false,
110
+ "special": true
111
+ },
112
+ "2": {
113
+ "content": "<bos>",
114
+ "single_word": false,
115
+ "lstrip": false,
116
+ "rstrip": false,
117
+ "normalized": false,
118
+ "special": true
119
+ },
120
+ "3": {
121
+ "content": "<unk>",
122
+ "single_word": false,
123
+ "lstrip": false,
124
+ "rstrip": false,
125
+ "normalized": false,
126
+ "special": true
127
+ },
128
+ "4": {
129
+ "content": "<mask>",
130
+ "single_word": false,
131
+ "lstrip": false,
132
+ "rstrip": false,
133
+ "normalized": false,
134
+ "special": true
135
+ },
136
+ "46": {
137
+ "content": "<|tool>",
138
+ "single_word": false,
139
+ "lstrip": false,
140
+ "rstrip": false,
141
+ "normalized": false,
142
+ "special": true
143
+ },
144
+ "47": {
145
+ "content": "<tool|>",
146
+ "single_word": false,
147
+ "lstrip": false,
148
+ "rstrip": false,
149
+ "normalized": false,
150
+ "special": true
151
+ },
152
+ "48": {
153
+ "content": "<|tool_call>",
154
+ "single_word": false,
155
+ "lstrip": false,
156
+ "rstrip": false,
157
+ "normalized": false,
158
+ "special": true
159
+ },
160
+ "49": {
161
+ "content": "<tool_call|>",
162
+ "single_word": false,
163
+ "lstrip": false,
164
+ "rstrip": false,
165
+ "normalized": false,
166
+ "special": true
167
+ },
168
+ "50": {
169
+ "content": "<|tool_response>",
170
+ "single_word": false,
171
+ "lstrip": false,
172
+ "rstrip": false,
173
+ "normalized": false,
174
+ "special": true
175
+ },
176
+ "51": {
177
+ "content": "<tool_response|>",
178
+ "single_word": false,
179
+ "lstrip": false,
180
+ "rstrip": false,
181
+ "normalized": false,
182
+ "special": true
183
+ },
184
+ "52": {
185
+ "content": "<|\"|>",
186
+ "single_word": false,
187
+ "lstrip": false,
188
+ "rstrip": false,
189
+ "normalized": false,
190
+ "special": true
191
+ },
192
+ "98": {
193
+ "content": "<|think|>",
194
+ "single_word": false,
195
+ "lstrip": false,
196
+ "rstrip": false,
197
+ "normalized": false,
198
+ "special": true
199
+ },
200
+ "100": {
201
+ "content": "<|channel>",
202
+ "single_word": false,
203
+ "lstrip": false,
204
+ "rstrip": false,
205
+ "normalized": false,
206
+ "special": true
207
+ },
208
+ "101": {
209
+ "content": "<channel|>",
210
+ "single_word": false,
211
+ "lstrip": false,
212
+ "rstrip": false,
213
+ "normalized": false,
214
+ "special": true
215
+ },
216
+ "105": {
217
+ "content": "<|turn>",
218
+ "single_word": false,
219
+ "lstrip": false,
220
+ "rstrip": false,
221
+ "normalized": false,
222
+ "special": true
223
+ },
224
+ "106": {
225
+ "content": "<turn|>",
226
+ "single_word": false,
227
+ "lstrip": false,
228
+ "rstrip": false,
229
+ "normalized": false,
230
+ "special": true
231
+ },
232
+ "255999": {
233
+ "content": "<|image>",
234
+ "single_word": false,
235
+ "lstrip": false,
236
+ "rstrip": false,
237
+ "normalized": false,
238
+ "special": true
239
+ },
240
+ "256000": {
241
+ "content": "<|audio>",
242
+ "single_word": false,
243
+ "lstrip": false,
244
+ "rstrip": false,
245
+ "normalized": false,
246
+ "special": true
247
+ },
248
+ "258880": {
249
+ "content": "<|image|>",
250
+ "single_word": false,
251
+ "lstrip": false,
252
+ "rstrip": false,
253
+ "normalized": false,
254
+ "special": true
255
+ },
256
+ "258881": {
257
+ "content": "<|audio|>",
258
+ "single_word": false,
259
+ "lstrip": false,
260
+ "rstrip": false,
261
+ "normalized": false,
262
+ "special": true
263
+ },
264
+ "258882": {
265
+ "content": "<image|>",
266
+ "single_word": false,
267
+ "lstrip": false,
268
+ "rstrip": false,
269
+ "normalized": false,
270
+ "special": true
271
+ },
272
+ "258883": {
273
+ "content": "<audio|>",
274
+ "single_word": false,
275
+ "lstrip": false,
276
+ "rstrip": false,
277
+ "normalized": false,
278
+ "special": true
279
+ },
280
+ "258884": {
281
+ "content": "<|video|>",
282
+ "single_word": false,
283
+ "lstrip": false,
284
+ "rstrip": false,
285
+ "normalized": false,
286
+ "special": true
287
+ }
288
+ }
289
+ }
trainer_state.json ADDED
@@ -0,0 +1,2506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.935483870967742,
6
+ "eval_steps": 30,
7
+ "global_step": 240,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.008064516129032258,
14
+ "grad_norm": NaN,
15
+ "learning_rate": 0.0,
16
+ "loss": 3.4369091987609863,
17
+ "num_input_tokens_seen": 3376,
18
+ "step": 1,
19
+ "train_runtime": 39.4836,
20
+ "train_tokens_per_second": 85.504
21
+ },
22
+ {
23
+ "epoch": 0.016129032258064516,
24
+ "grad_norm": 26.560325622558594,
25
+ "learning_rate": 0.0,
26
+ "loss": 3.346249580383301,
27
+ "num_input_tokens_seen": 6750,
28
+ "step": 2,
29
+ "train_runtime": 45.351,
30
+ "train_tokens_per_second": 148.839
31
+ },
32
+ {
33
+ "epoch": 0.024193548387096774,
34
+ "grad_norm": 26.696680068969727,
35
+ "learning_rate": 4e-05,
36
+ "loss": 3.3645849227905273,
37
+ "num_input_tokens_seen": 10132,
38
+ "step": 3,
39
+ "train_runtime": 50.5353,
40
+ "train_tokens_per_second": 200.493
41
+ },
42
+ {
43
+ "epoch": 0.03225806451612903,
44
+ "grad_norm": 11.073655128479004,
45
+ "learning_rate": 8e-05,
46
+ "loss": 2.3441145420074463,
47
+ "num_input_tokens_seen": 13496,
48
+ "step": 4,
49
+ "train_runtime": 55.9224,
50
+ "train_tokens_per_second": 241.334
51
+ },
52
+ {
53
+ "epoch": 0.04032258064516129,
54
+ "grad_norm": 5.371010780334473,
55
+ "learning_rate": 0.00012,
56
+ "loss": 1.8813656568527222,
57
+ "num_input_tokens_seen": 16978,
58
+ "step": 5,
59
+ "train_runtime": 61.2789,
60
+ "train_tokens_per_second": 277.061
61
+ },
62
+ {
63
+ "epoch": 0.04838709677419355,
64
+ "grad_norm": 2.7816290855407715,
65
+ "learning_rate": 0.00016,
66
+ "loss": 1.4953795671463013,
67
+ "num_input_tokens_seen": 20170,
68
+ "step": 6,
69
+ "train_runtime": 66.1452,
70
+ "train_tokens_per_second": 304.935
71
+ },
72
+ {
73
+ "epoch": 0.056451612903225805,
74
+ "grad_norm": 2.539138078689575,
75
+ "learning_rate": 0.0002,
76
+ "loss": 1.3141508102416992,
77
+ "num_input_tokens_seen": 23566,
78
+ "step": 7,
79
+ "train_runtime": 71.1819,
80
+ "train_tokens_per_second": 331.067
81
+ },
82
+ {
83
+ "epoch": 0.06451612903225806,
84
+ "grad_norm": 2.33978009223938,
85
+ "learning_rate": 0.00019999967645432384,
86
+ "loss": 1.2497824430465698,
87
+ "num_input_tokens_seen": 26930,
88
+ "step": 8,
89
+ "train_runtime": 75.9867,
90
+ "train_tokens_per_second": 354.404
91
+ },
92
+ {
93
+ "epoch": 0.07258064516129033,
94
+ "grad_norm": 1.5424915552139282,
95
+ "learning_rate": 0.00019999870581938894,
96
+ "loss": 1.093822956085205,
97
+ "num_input_tokens_seen": 30108,
98
+ "step": 9,
99
+ "train_runtime": 80.6039,
100
+ "train_tokens_per_second": 373.53
101
+ },
102
+ {
103
+ "epoch": 0.08064516129032258,
104
+ "grad_norm": 1.7023719549179077,
105
+ "learning_rate": 0.0001999970881014762,
106
+ "loss": 1.075514793395996,
107
+ "num_input_tokens_seen": 33254,
108
+ "step": 10,
109
+ "train_runtime": 84.9993,
110
+ "train_tokens_per_second": 391.227
111
+ },
112
+ {
113
+ "epoch": 0.08870967741935484,
114
+ "grad_norm": 1.1276626586914062,
115
+ "learning_rate": 0.00019999482331105377,
116
+ "loss": 1.0134414434432983,
117
+ "num_input_tokens_seen": 36458,
118
+ "step": 11,
119
+ "train_runtime": 89.4147,
120
+ "train_tokens_per_second": 407.74
121
+ },
122
+ {
123
+ "epoch": 0.0967741935483871,
124
+ "grad_norm": 1.089064598083496,
125
+ "learning_rate": 0.0001999919114627769,
126
+ "loss": 1.0177619457244873,
127
+ "num_input_tokens_seen": 39848,
128
+ "step": 12,
129
+ "train_runtime": 94.1911,
130
+ "train_tokens_per_second": 423.055
131
+ },
132
+ {
133
+ "epoch": 0.10483870967741936,
134
+ "grad_norm": 0.8842246532440186,
135
+ "learning_rate": 0.00019998835257548786,
136
+ "loss": 0.9782118797302246,
137
+ "num_input_tokens_seen": 42964,
138
+ "step": 13,
139
+ "train_runtime": 98.5173,
140
+ "train_tokens_per_second": 436.106
141
+ },
142
+ {
143
+ "epoch": 0.11290322580645161,
144
+ "grad_norm": 0.7232466340065002,
145
+ "learning_rate": 0.00019998414667221596,
146
+ "loss": 0.895933985710144,
147
+ "num_input_tokens_seen": 46336,
148
+ "step": 14,
149
+ "train_runtime": 103.1565,
150
+ "train_tokens_per_second": 449.182
151
+ },
152
+ {
153
+ "epoch": 0.12096774193548387,
154
+ "grad_norm": 0.7355626225471497,
155
+ "learning_rate": 0.00019997929378017725,
156
+ "loss": 0.9221972227096558,
157
+ "num_input_tokens_seen": 49524,
158
+ "step": 15,
159
+ "train_runtime": 107.7259,
160
+ "train_tokens_per_second": 459.723
161
+ },
162
+ {
163
+ "epoch": 0.12903225806451613,
164
+ "grad_norm": 0.7629883289337158,
165
+ "learning_rate": 0.00019997379393077428,
166
+ "loss": 0.884993314743042,
167
+ "num_input_tokens_seen": 52878,
168
+ "step": 16,
169
+ "train_runtime": 112.3676,
170
+ "train_tokens_per_second": 470.58
171
+ },
172
+ {
173
+ "epoch": 0.13709677419354838,
174
+ "grad_norm": 0.7517801523208618,
175
+ "learning_rate": 0.00019996764715959618,
176
+ "loss": 0.8044445514678955,
177
+ "num_input_tokens_seen": 56342,
178
+ "step": 17,
179
+ "train_runtime": 117.2539,
180
+ "train_tokens_per_second": 480.513
181
+ },
182
+ {
183
+ "epoch": 0.14516129032258066,
184
+ "grad_norm": 0.7528966665267944,
185
+ "learning_rate": 0.0001999608535064182,
186
+ "loss": 0.8287011981010437,
187
+ "num_input_tokens_seen": 59734,
188
+ "step": 18,
189
+ "train_runtime": 122.2813,
190
+ "train_tokens_per_second": 488.497
191
+ },
192
+ {
193
+ "epoch": 0.1532258064516129,
194
+ "grad_norm": 1.0456326007843018,
195
+ "learning_rate": 0.0001999534130152014,
196
+ "loss": 0.8226615190505981,
197
+ "num_input_tokens_seen": 63022,
198
+ "step": 19,
199
+ "train_runtime": 127.0278,
200
+ "train_tokens_per_second": 496.128
201
+ },
202
+ {
203
+ "epoch": 0.16129032258064516,
204
+ "grad_norm": 0.9146727323532104,
205
+ "learning_rate": 0.00019994532573409262,
206
+ "loss": 0.8422068357467651,
207
+ "num_input_tokens_seen": 66380,
208
+ "step": 20,
209
+ "train_runtime": 131.9068,
210
+ "train_tokens_per_second": 503.234
211
+ },
212
+ {
213
+ "epoch": 0.1693548387096774,
214
+ "grad_norm": 0.8606871962547302,
215
+ "learning_rate": 0.0001999365917154239,
216
+ "loss": 0.8119312524795532,
217
+ "num_input_tokens_seen": 69412,
218
+ "step": 21,
219
+ "train_runtime": 136.3606,
220
+ "train_tokens_per_second": 509.033
221
+ },
222
+ {
223
+ "epoch": 0.1774193548387097,
224
+ "grad_norm": 0.7490668296813965,
225
+ "learning_rate": 0.00019992721101571236,
226
+ "loss": 0.8383636474609375,
227
+ "num_input_tokens_seen": 72682,
228
+ "step": 22,
229
+ "train_runtime": 141.0773,
230
+ "train_tokens_per_second": 515.193
231
+ },
232
+ {
233
+ "epoch": 0.18548387096774194,
234
+ "grad_norm": 0.7244526743888855,
235
+ "learning_rate": 0.0001999171836956597,
236
+ "loss": 0.8476752638816833,
237
+ "num_input_tokens_seen": 75880,
238
+ "step": 23,
239
+ "train_runtime": 145.8525,
240
+ "train_tokens_per_second": 520.252
241
+ },
242
+ {
243
+ "epoch": 0.1935483870967742,
244
+ "grad_norm": 0.7046738266944885,
245
+ "learning_rate": 0.0001999065098201518,
246
+ "loss": 0.7803716063499451,
247
+ "num_input_tokens_seen": 79364,
248
+ "step": 24,
249
+ "train_runtime": 150.8071,
250
+ "train_tokens_per_second": 526.262
251
+ },
252
+ {
253
+ "epoch": 0.20161290322580644,
254
+ "grad_norm": 0.6511566042900085,
255
+ "learning_rate": 0.00019989518945825844,
256
+ "loss": 0.7314240336418152,
257
+ "num_input_tokens_seen": 82600,
258
+ "step": 25,
259
+ "train_runtime": 155.6778,
260
+ "train_tokens_per_second": 530.583
261
+ },
262
+ {
263
+ "epoch": 0.20967741935483872,
264
+ "grad_norm": 0.7805576324462891,
265
+ "learning_rate": 0.00019988322268323268,
266
+ "loss": 0.7415602207183838,
267
+ "num_input_tokens_seen": 85954,
268
+ "step": 26,
269
+ "train_runtime": 160.5146,
270
+ "train_tokens_per_second": 535.49
271
+ },
272
+ {
273
+ "epoch": 0.21774193548387097,
274
+ "grad_norm": 0.7170730829238892,
275
+ "learning_rate": 0.00019987060957251047,
276
+ "loss": 0.7152543663978577,
277
+ "num_input_tokens_seen": 89314,
278
+ "step": 27,
279
+ "train_runtime": 165.2621,
280
+ "train_tokens_per_second": 540.438
281
+ },
282
+ {
283
+ "epoch": 0.22580645161290322,
284
+ "grad_norm": 0.7345016002655029,
285
+ "learning_rate": 0.00019985735020771017,
286
+ "loss": 0.7394841909408569,
287
+ "num_input_tokens_seen": 92550,
288
+ "step": 28,
289
+ "train_runtime": 169.9598,
290
+ "train_tokens_per_second": 544.54
291
+ },
292
+ {
293
+ "epoch": 0.23387096774193547,
294
+ "grad_norm": 0.6704856157302856,
295
+ "learning_rate": 0.00019984344467463197,
296
+ "loss": 0.6681033372879028,
297
+ "num_input_tokens_seen": 95904,
298
+ "step": 29,
299
+ "train_runtime": 174.6804,
300
+ "train_tokens_per_second": 549.026
301
+ },
302
+ {
303
+ "epoch": 0.24193548387096775,
304
+ "grad_norm": 0.6933154463768005,
305
+ "learning_rate": 0.0001998288930632574,
306
+ "loss": 0.7358725070953369,
307
+ "num_input_tokens_seen": 99314,
308
+ "step": 30,
309
+ "train_runtime": 179.518,
310
+ "train_tokens_per_second": 553.226
311
+ },
312
+ {
313
+ "epoch": 0.24193548387096775,
314
+ "eval_loss": 3.1636757850646973,
315
+ "eval_runtime": 13.9458,
316
+ "eval_samples_per_second": 3.729,
317
+ "eval_steps_per_second": 1.864,
318
+ "num_input_tokens_seen": 99314,
319
+ "step": 30
320
+ },
321
+ {
322
+ "epoch": 0.25,
323
+ "grad_norm": 0.6650590300559998,
324
+ "learning_rate": 0.00019981369546774865,
325
+ "loss": 0.7312928438186646,
326
+ "num_input_tokens_seen": 102632,
327
+ "step": 31,
328
+ "train_runtime": 234.0652,
329
+ "train_tokens_per_second": 438.476
330
+ },
331
+ {
332
+ "epoch": 0.25806451612903225,
333
+ "grad_norm": 0.6843596696853638,
334
+ "learning_rate": 0.00019979785198644806,
335
+ "loss": 0.7392816543579102,
336
+ "num_input_tokens_seen": 106032,
337
+ "step": 32,
338
+ "train_runtime": 238.7402,
339
+ "train_tokens_per_second": 444.131
340
+ },
341
+ {
342
+ "epoch": 0.2661290322580645,
343
+ "grad_norm": 0.6795448660850525,
344
+ "learning_rate": 0.00019978136272187747,
345
+ "loss": 0.6713091135025024,
346
+ "num_input_tokens_seen": 109374,
347
+ "step": 33,
348
+ "train_runtime": 243.5988,
349
+ "train_tokens_per_second": 448.992
350
+ },
351
+ {
352
+ "epoch": 0.27419354838709675,
353
+ "grad_norm": 0.6419102549552917,
354
+ "learning_rate": 0.0001997642277807374,
355
+ "loss": 0.7344367504119873,
356
+ "num_input_tokens_seen": 112780,
357
+ "step": 34,
358
+ "train_runtime": 248.5335,
359
+ "train_tokens_per_second": 453.782
360
+ },
361
+ {
362
+ "epoch": 0.28225806451612906,
363
+ "grad_norm": 0.7528514862060547,
364
+ "learning_rate": 0.00019974644727390665,
365
+ "loss": 0.6826475262641907,
366
+ "num_input_tokens_seen": 116154,
367
+ "step": 35,
368
+ "train_runtime": 253.6613,
369
+ "train_tokens_per_second": 457.91
370
+ },
371
+ {
372
+ "epoch": 0.2903225806451613,
373
+ "grad_norm": 0.7251984477043152,
374
+ "learning_rate": 0.00019972802131644127,
375
+ "loss": 0.702574610710144,
376
+ "num_input_tokens_seen": 119350,
377
+ "step": 36,
378
+ "train_runtime": 258.505,
379
+ "train_tokens_per_second": 461.693
380
+ },
381
+ {
382
+ "epoch": 0.29838709677419356,
383
+ "grad_norm": 0.7205802202224731,
384
+ "learning_rate": 0.00019970895002757413,
385
+ "loss": 0.7223511934280396,
386
+ "num_input_tokens_seen": 122566,
387
+ "step": 37,
388
+ "train_runtime": 263.411,
389
+ "train_tokens_per_second": 465.303
390
+ },
391
+ {
392
+ "epoch": 0.3064516129032258,
393
+ "grad_norm": 0.6758707165718079,
394
+ "learning_rate": 0.00019968923353071377,
395
+ "loss": 0.6775951981544495,
396
+ "num_input_tokens_seen": 125906,
397
+ "step": 38,
398
+ "train_runtime": 268.6516,
399
+ "train_tokens_per_second": 468.659
400
+ },
401
+ {
402
+ "epoch": 0.31451612903225806,
403
+ "grad_norm": 0.7149205207824707,
404
+ "learning_rate": 0.00019966887195344403,
405
+ "loss": 0.6415547132492065,
406
+ "num_input_tokens_seen": 129178,
407
+ "step": 39,
408
+ "train_runtime": 273.5735,
409
+ "train_tokens_per_second": 472.188
410
+ },
411
+ {
412
+ "epoch": 0.3225806451612903,
413
+ "grad_norm": 0.8233773708343506,
414
+ "learning_rate": 0.0001996478654275229,
415
+ "loss": 0.6871565580368042,
416
+ "num_input_tokens_seen": 132242,
417
+ "step": 40,
418
+ "train_runtime": 278.1257,
419
+ "train_tokens_per_second": 475.476
420
+ },
421
+ {
422
+ "epoch": 0.33064516129032256,
423
+ "grad_norm": 0.603967010974884,
424
+ "learning_rate": 0.00019962621408888177,
425
+ "loss": 0.6418657302856445,
426
+ "num_input_tokens_seen": 135598,
427
+ "step": 41,
428
+ "train_runtime": 283.0503,
429
+ "train_tokens_per_second": 479.06
430
+ },
431
+ {
432
+ "epoch": 0.3387096774193548,
433
+ "grad_norm": 0.6170377731323242,
434
+ "learning_rate": 0.00019960391807762463,
435
+ "loss": 0.6446406245231628,
436
+ "num_input_tokens_seen": 139036,
437
+ "step": 42,
438
+ "train_runtime": 287.8831,
439
+ "train_tokens_per_second": 482.96
440
+ },
441
+ {
442
+ "epoch": 0.3467741935483871,
443
+ "grad_norm": 0.6677550077438354,
444
+ "learning_rate": 0.00019958097753802693,
445
+ "loss": 0.6513428092002869,
446
+ "num_input_tokens_seen": 142314,
447
+ "step": 43,
448
+ "train_runtime": 292.6153,
449
+ "train_tokens_per_second": 486.352
450
+ },
451
+ {
452
+ "epoch": 0.3548387096774194,
453
+ "grad_norm": 0.5713569521903992,
454
+ "learning_rate": 0.00019955739261853504,
455
+ "loss": 0.6523911952972412,
456
+ "num_input_tokens_seen": 145730,
457
+ "step": 44,
458
+ "train_runtime": 297.3525,
459
+ "train_tokens_per_second": 490.092
460
+ },
461
+ {
462
+ "epoch": 0.3629032258064516,
463
+ "grad_norm": 0.6360762119293213,
464
+ "learning_rate": 0.00019953316347176488,
465
+ "loss": 0.6005333662033081,
466
+ "num_input_tokens_seen": 148950,
467
+ "step": 45,
468
+ "train_runtime": 301.7634,
469
+ "train_tokens_per_second": 493.599
470
+ },
471
+ {
472
+ "epoch": 0.3709677419354839,
473
+ "grad_norm": 0.5776824355125427,
474
+ "learning_rate": 0.00019950829025450114,
475
+ "loss": 0.6482651233673096,
476
+ "num_input_tokens_seen": 152208,
477
+ "step": 46,
478
+ "train_runtime": 306.4442,
479
+ "train_tokens_per_second": 496.691
480
+ },
481
+ {
482
+ "epoch": 0.3790322580645161,
483
+ "grad_norm": 0.5635464787483215,
484
+ "learning_rate": 0.0001994827731276963,
485
+ "loss": 0.5292215943336487,
486
+ "num_input_tokens_seen": 155494,
487
+ "step": 47,
488
+ "train_runtime": 310.97,
489
+ "train_tokens_per_second": 500.029
490
+ },
491
+ {
492
+ "epoch": 0.3870967741935484,
493
+ "grad_norm": 0.7464826107025146,
494
+ "learning_rate": 0.00019945661225646946,
495
+ "loss": 0.5896004438400269,
496
+ "num_input_tokens_seen": 158822,
497
+ "step": 48,
498
+ "train_runtime": 315.5591,
499
+ "train_tokens_per_second": 503.304
500
+ },
501
+ {
502
+ "epoch": 0.3951612903225806,
503
+ "grad_norm": 0.6741185784339905,
504
+ "learning_rate": 0.0001994298078101054,
505
+ "loss": 0.610090970993042,
506
+ "num_input_tokens_seen": 162234,
507
+ "step": 49,
508
+ "train_runtime": 320.3691,
509
+ "train_tokens_per_second": 506.397
510
+ },
511
+ {
512
+ "epoch": 0.4032258064516129,
513
+ "grad_norm": 0.8294225931167603,
514
+ "learning_rate": 0.00019940235996205333,
515
+ "loss": 0.7023945450782776,
516
+ "num_input_tokens_seen": 165674,
517
+ "step": 50,
518
+ "train_runtime": 325.0853,
519
+ "train_tokens_per_second": 509.632
520
+ },
521
+ {
522
+ "epoch": 0.4112903225806452,
523
+ "grad_norm": 0.6233593821525574,
524
+ "learning_rate": 0.0001993742688899259,
525
+ "loss": 0.5964958667755127,
526
+ "num_input_tokens_seen": 169012,
527
+ "step": 51,
528
+ "train_runtime": 329.9005,
529
+ "train_tokens_per_second": 512.312
530
+ },
531
+ {
532
+ "epoch": 0.41935483870967744,
533
+ "grad_norm": 0.6552059054374695,
534
+ "learning_rate": 0.00019934553477549794,
535
+ "loss": 0.5970840454101562,
536
+ "num_input_tokens_seen": 172396,
537
+ "step": 52,
538
+ "train_runtime": 334.6436,
539
+ "train_tokens_per_second": 515.163
540
+ },
541
+ {
542
+ "epoch": 0.4274193548387097,
543
+ "grad_norm": 0.5657429695129395,
544
+ "learning_rate": 0.00019931615780470558,
545
+ "loss": 0.6213586330413818,
546
+ "num_input_tokens_seen": 175766,
547
+ "step": 53,
548
+ "train_runtime": 339.4928,
549
+ "train_tokens_per_second": 517.731
550
+ },
551
+ {
552
+ "epoch": 0.43548387096774194,
553
+ "grad_norm": 0.6272680163383484,
554
+ "learning_rate": 0.00019928613816764458,
555
+ "loss": 0.5776374340057373,
556
+ "num_input_tokens_seen": 179034,
557
+ "step": 54,
558
+ "train_runtime": 344.3701,
559
+ "train_tokens_per_second": 519.888
560
+ },
561
+ {
562
+ "epoch": 0.4435483870967742,
563
+ "grad_norm": 0.5951282382011414,
564
+ "learning_rate": 0.00019925547605856934,
565
+ "loss": 0.6662795543670654,
566
+ "num_input_tokens_seen": 182380,
567
+ "step": 55,
568
+ "train_runtime": 349.2001,
569
+ "train_tokens_per_second": 522.279
570
+ },
571
+ {
572
+ "epoch": 0.45161290322580644,
573
+ "grad_norm": 0.6823417544364929,
574
+ "learning_rate": 0.00019922417167589183,
575
+ "loss": 0.6379531621932983,
576
+ "num_input_tokens_seen": 185574,
577
+ "step": 56,
578
+ "train_runtime": 353.9778,
579
+ "train_tokens_per_second": 524.253
580
+ },
581
+ {
582
+ "epoch": 0.4596774193548387,
583
+ "grad_norm": 0.5668640732765198,
584
+ "learning_rate": 0.00019919222522217996,
585
+ "loss": 0.6385904550552368,
586
+ "num_input_tokens_seen": 188970,
587
+ "step": 57,
588
+ "train_runtime": 358.9666,
589
+ "train_tokens_per_second": 526.428
590
+ },
591
+ {
592
+ "epoch": 0.46774193548387094,
593
+ "grad_norm": 0.5809326171875,
594
+ "learning_rate": 0.00019915963690415647,
595
+ "loss": 0.5536263585090637,
596
+ "num_input_tokens_seen": 192282,
597
+ "step": 58,
598
+ "train_runtime": 363.7121,
599
+ "train_tokens_per_second": 528.665
600
+ },
601
+ {
602
+ "epoch": 0.47580645161290325,
603
+ "grad_norm": 0.6727699041366577,
604
+ "learning_rate": 0.00019912640693269752,
605
+ "loss": 0.6002112627029419,
606
+ "num_input_tokens_seen": 195644,
607
+ "step": 59,
608
+ "train_runtime": 368.6469,
609
+ "train_tokens_per_second": 530.708
610
+ },
611
+ {
612
+ "epoch": 0.4838709677419355,
613
+ "grad_norm": 0.6097072958946228,
614
+ "learning_rate": 0.00019909253552283143,
615
+ "loss": 0.611109733581543,
616
+ "num_input_tokens_seen": 198956,
617
+ "step": 60,
618
+ "train_runtime": 373.378,
619
+ "train_tokens_per_second": 532.854
620
+ },
621
+ {
622
+ "epoch": 0.4838709677419355,
623
+ "eval_loss": 3.0930163860321045,
624
+ "eval_runtime": 12.3352,
625
+ "eval_samples_per_second": 4.216,
626
+ "eval_steps_per_second": 2.108,
627
+ "num_input_tokens_seen": 198956,
628
+ "step": 60
629
+ },
630
+ {
631
+ "epoch": 0.49193548387096775,
632
+ "grad_norm": 0.608394205570221,
633
+ "learning_rate": 0.00019905802289373715,
634
+ "loss": 0.5526689887046814,
635
+ "num_input_tokens_seen": 202314,
636
+ "step": 61,
637
+ "train_runtime": 405.9522,
638
+ "train_tokens_per_second": 498.369
639
+ },
640
+ {
641
+ "epoch": 0.5,
642
+ "grad_norm": 0.6111866235733032,
643
+ "learning_rate": 0.0001990228692687429,
644
+ "loss": 0.5586686730384827,
645
+ "num_input_tokens_seen": 205722,
646
+ "step": 62,
647
+ "train_runtime": 410.5864,
648
+ "train_tokens_per_second": 501.044
649
+ },
650
+ {
651
+ "epoch": 0.5080645161290323,
652
+ "grad_norm": 0.6161165237426758,
653
+ "learning_rate": 0.00019898707487532474,
654
+ "loss": 0.641068160533905,
655
+ "num_input_tokens_seen": 208882,
656
+ "step": 63,
657
+ "train_runtime": 415.1353,
658
+ "train_tokens_per_second": 503.166
659
+ },
660
+ {
661
+ "epoch": 0.5161290322580645,
662
+ "grad_norm": 0.6087555289268494,
663
+ "learning_rate": 0.0001989506399451051,
664
+ "loss": 0.5811461210250854,
665
+ "num_input_tokens_seen": 212198,
666
+ "step": 64,
667
+ "train_runtime": 420.1346,
668
+ "train_tokens_per_second": 505.071
669
+ },
670
+ {
671
+ "epoch": 0.5241935483870968,
672
+ "grad_norm": 0.6409975290298462,
673
+ "learning_rate": 0.0001989135647138513,
674
+ "loss": 0.6092388033866882,
675
+ "num_input_tokens_seen": 215586,
676
+ "step": 65,
677
+ "train_runtime": 425.1826,
678
+ "train_tokens_per_second": 507.043
679
+ },
680
+ {
681
+ "epoch": 0.532258064516129,
682
+ "grad_norm": 0.7127669453620911,
683
+ "learning_rate": 0.00019887584942147394,
684
+ "loss": 0.6781883239746094,
685
+ "num_input_tokens_seen": 218732,
686
+ "step": 66,
687
+ "train_runtime": 430.1434,
688
+ "train_tokens_per_second": 508.509
689
+ },
690
+ {
691
+ "epoch": 0.5403225806451613,
692
+ "grad_norm": 0.6341464519500732,
693
+ "learning_rate": 0.0001988374943120254,
694
+ "loss": 0.6367583274841309,
695
+ "num_input_tokens_seen": 221938,
696
+ "step": 67,
697
+ "train_runtime": 435.098,
698
+ "train_tokens_per_second": 510.087
699
+ },
700
+ {
701
+ "epoch": 0.5483870967741935,
702
+ "grad_norm": 0.6676456332206726,
703
+ "learning_rate": 0.00019879849963369827,
704
+ "loss": 0.5986863970756531,
705
+ "num_input_tokens_seen": 225362,
706
+ "step": 68,
707
+ "train_runtime": 440.4076,
708
+ "train_tokens_per_second": 511.712
709
+ },
710
+ {
711
+ "epoch": 0.5564516129032258,
712
+ "grad_norm": 0.5387278199195862,
713
+ "learning_rate": 0.00019875886563882375,
714
+ "loss": 0.5684967637062073,
715
+ "num_input_tokens_seen": 228656,
716
+ "step": 69,
717
+ "train_runtime": 445.6788,
718
+ "train_tokens_per_second": 513.051
719
+ },
720
+ {
721
+ "epoch": 0.5645161290322581,
722
+ "grad_norm": 0.5598258972167969,
723
+ "learning_rate": 0.00019871859258387,
724
+ "loss": 0.5872923135757446,
725
+ "num_input_tokens_seen": 232052,
726
+ "step": 70,
727
+ "train_runtime": 450.5989,
728
+ "train_tokens_per_second": 514.986
729
+ },
730
+ {
731
+ "epoch": 0.5725806451612904,
732
+ "grad_norm": 0.5795265436172485,
733
+ "learning_rate": 0.00019867768072944045,
734
+ "loss": 0.5454657673835754,
735
+ "num_input_tokens_seen": 235430,
736
+ "step": 71,
737
+ "train_runtime": 455.462,
738
+ "train_tokens_per_second": 516.904
739
+ },
740
+ {
741
+ "epoch": 0.5806451612903226,
742
+ "grad_norm": 0.6275506019592285,
743
+ "learning_rate": 0.00019863613034027224,
744
+ "loss": 0.5985739827156067,
745
+ "num_input_tokens_seen": 238792,
746
+ "step": 72,
747
+ "train_runtime": 460.1637,
748
+ "train_tokens_per_second": 518.928
749
+ },
750
+ {
751
+ "epoch": 0.5887096774193549,
752
+ "grad_norm": 0.6417560577392578,
753
+ "learning_rate": 0.0001985939416852343,
754
+ "loss": 0.6151498556137085,
755
+ "num_input_tokens_seen": 242136,
756
+ "step": 73,
757
+ "train_runtime": 464.8752,
758
+ "train_tokens_per_second": 520.862
759
+ },
760
+ {
761
+ "epoch": 0.5967741935483871,
762
+ "grad_norm": 0.6558713912963867,
763
+ "learning_rate": 0.00019855111503732574,
764
+ "loss": 0.6127005219459534,
765
+ "num_input_tokens_seen": 245502,
766
+ "step": 74,
767
+ "train_runtime": 469.6002,
768
+ "train_tokens_per_second": 522.789
769
+ },
770
+ {
771
+ "epoch": 0.6048387096774194,
772
+ "grad_norm": 0.5617387294769287,
773
+ "learning_rate": 0.00019850765067367412,
774
+ "loss": 0.5805050730705261,
775
+ "num_input_tokens_seen": 248724,
776
+ "step": 75,
777
+ "train_runtime": 473.9894,
778
+ "train_tokens_per_second": 524.746
779
+ },
780
+ {
781
+ "epoch": 0.6129032258064516,
782
+ "grad_norm": 0.5580056309700012,
783
+ "learning_rate": 0.00019846354887553358,
784
+ "loss": 0.5755968689918518,
785
+ "num_input_tokens_seen": 252068,
786
+ "step": 76,
787
+ "train_runtime": 478.6533,
788
+ "train_tokens_per_second": 526.619
789
+ },
790
+ {
791
+ "epoch": 0.6209677419354839,
792
+ "grad_norm": 0.5987703204154968,
793
+ "learning_rate": 0.00019841880992828306,
794
+ "loss": 0.5941160321235657,
795
+ "num_input_tokens_seen": 255328,
796
+ "step": 77,
797
+ "train_runtime": 483.1368,
798
+ "train_tokens_per_second": 528.48
799
+ },
800
+ {
801
+ "epoch": 0.6290322580645161,
802
+ "grad_norm": 0.5982978940010071,
803
+ "learning_rate": 0.0001983734341214244,
804
+ "loss": 0.5437498688697815,
805
+ "num_input_tokens_seen": 258440,
806
+ "step": 78,
807
+ "train_runtime": 487.3985,
808
+ "train_tokens_per_second": 530.244
809
+ },
810
+ {
811
+ "epoch": 0.6370967741935484,
812
+ "grad_norm": 0.6150534749031067,
813
+ "learning_rate": 0.00019832742174858052,
814
+ "loss": 0.4968106150627136,
815
+ "num_input_tokens_seen": 261670,
816
+ "step": 79,
817
+ "train_runtime": 492.0778,
818
+ "train_tokens_per_second": 531.765
819
+ },
820
+ {
821
+ "epoch": 0.6451612903225806,
822
+ "grad_norm": 0.6039842367172241,
823
+ "learning_rate": 0.0001982807731074935,
824
+ "loss": 0.5774258375167847,
825
+ "num_input_tokens_seen": 265042,
826
+ "step": 80,
827
+ "train_runtime": 496.8154,
828
+ "train_tokens_per_second": 533.482
829
+ },
830
+ {
831
+ "epoch": 0.6532258064516129,
832
+ "grad_norm": 0.6645222902297974,
833
+ "learning_rate": 0.00019823348850002268,
834
+ "loss": 0.5437123775482178,
835
+ "num_input_tokens_seen": 268396,
836
+ "step": 81,
837
+ "train_runtime": 501.4894,
838
+ "train_tokens_per_second": 535.198
839
+ },
840
+ {
841
+ "epoch": 0.6612903225806451,
842
+ "grad_norm": 0.6312376856803894,
843
+ "learning_rate": 0.00019818556823214268,
844
+ "loss": 0.5945921540260315,
845
+ "num_input_tokens_seen": 271554,
846
+ "step": 82,
847
+ "train_runtime": 506.2627,
848
+ "train_tokens_per_second": 536.389
849
+ },
850
+ {
851
+ "epoch": 0.6693548387096774,
852
+ "grad_norm": 0.7044872641563416,
853
+ "learning_rate": 0.00019813701261394136,
854
+ "loss": 0.6143019199371338,
855
+ "num_input_tokens_seen": 274730,
856
+ "step": 83,
857
+ "train_runtime": 510.8028,
858
+ "train_tokens_per_second": 537.84
859
+ },
860
+ {
861
+ "epoch": 0.6774193548387096,
862
+ "grad_norm": 0.5553087592124939,
863
+ "learning_rate": 0.00019808782195961797,
864
+ "loss": 0.5542465448379517,
865
+ "num_input_tokens_seen": 278100,
866
+ "step": 84,
867
+ "train_runtime": 515.6576,
868
+ "train_tokens_per_second": 539.311
869
+ },
870
+ {
871
+ "epoch": 0.6854838709677419,
872
+ "grad_norm": 0.561292290687561,
873
+ "learning_rate": 0.00019803799658748094,
874
+ "loss": 0.5175400376319885,
875
+ "num_input_tokens_seen": 281468,
876
+ "step": 85,
877
+ "train_runtime": 520.6317,
878
+ "train_tokens_per_second": 540.628
879
+ },
880
+ {
881
+ "epoch": 0.6935483870967742,
882
+ "grad_norm": 0.5422635078430176,
883
+ "learning_rate": 0.000197987536819946,
884
+ "loss": 0.6259896755218506,
885
+ "num_input_tokens_seen": 284944,
886
+ "step": 86,
887
+ "train_runtime": 525.5795,
888
+ "train_tokens_per_second": 542.152
889
+ },
890
+ {
891
+ "epoch": 0.7016129032258065,
892
+ "grad_norm": 0.555388867855072,
893
+ "learning_rate": 0.0001979364429835339,
894
+ "loss": 0.5073646306991577,
895
+ "num_input_tokens_seen": 288252,
896
+ "step": 87,
897
+ "train_runtime": 530.5048,
898
+ "train_tokens_per_second": 543.354
899
+ },
900
+ {
901
+ "epoch": 0.7096774193548387,
902
+ "grad_norm": 0.5314076542854309,
903
+ "learning_rate": 0.00019788471540886844,
904
+ "loss": 0.5061646103858948,
905
+ "num_input_tokens_seen": 291674,
906
+ "step": 88,
907
+ "train_runtime": 535.3554,
908
+ "train_tokens_per_second": 544.823
909
+ },
910
+ {
911
+ "epoch": 0.717741935483871,
912
+ "grad_norm": 0.531013548374176,
913
+ "learning_rate": 0.0001978323544306743,
914
+ "loss": 0.5036947727203369,
915
+ "num_input_tokens_seen": 295054,
916
+ "step": 89,
917
+ "train_runtime": 540.1647,
918
+ "train_tokens_per_second": 546.23
919
+ },
920
+ {
921
+ "epoch": 0.7258064516129032,
922
+ "grad_norm": 0.7979210615158081,
923
+ "learning_rate": 0.00019777936038777483,
924
+ "loss": 0.613807201385498,
925
+ "num_input_tokens_seen": 298162,
926
+ "step": 90,
927
+ "train_runtime": 544.7969,
928
+ "train_tokens_per_second": 547.29
929
+ },
930
+ {
931
+ "epoch": 0.7258064516129032,
932
+ "eval_loss": 2.9071261882781982,
933
+ "eval_runtime": 12.2185,
934
+ "eval_samples_per_second": 4.256,
935
+ "eval_steps_per_second": 2.128,
936
+ "num_input_tokens_seen": 298162,
937
+ "step": 90
938
+ },
939
+ {
940
+ "epoch": 0.7338709677419355,
941
+ "grad_norm": 0.5745816826820374,
942
+ "learning_rate": 0.0001977257336230899,
943
+ "loss": 0.48962247371673584,
944
+ "num_input_tokens_seen": 301500,
945
+ "step": 91,
946
+ "train_runtime": 597.652,
947
+ "train_tokens_per_second": 504.474
948
+ },
949
+ {
950
+ "epoch": 0.7419354838709677,
951
+ "grad_norm": 0.7026141285896301,
952
+ "learning_rate": 0.00019767147448363366,
953
+ "loss": 0.5600932836532593,
954
+ "num_input_tokens_seen": 304624,
955
+ "step": 92,
956
+ "train_runtime": 602.1005,
957
+ "train_tokens_per_second": 505.936
958
+ },
959
+ {
960
+ "epoch": 0.75,
961
+ "grad_norm": 0.6634042859077454,
962
+ "learning_rate": 0.00019761658332051235,
963
+ "loss": 0.5759444236755371,
964
+ "num_input_tokens_seen": 307956,
965
+ "step": 93,
966
+ "train_runtime": 606.9852,
967
+ "train_tokens_per_second": 507.353
968
+ },
969
+ {
970
+ "epoch": 0.7580645161290323,
971
+ "grad_norm": 0.5837485790252686,
972
+ "learning_rate": 0.00019756106048892186,
973
+ "loss": 0.5830598473548889,
974
+ "num_input_tokens_seen": 311354,
975
+ "step": 94,
976
+ "train_runtime": 611.9247,
977
+ "train_tokens_per_second": 508.811
978
+ },
979
+ {
980
+ "epoch": 0.7661290322580645,
981
+ "grad_norm": 0.581575334072113,
982
+ "learning_rate": 0.00019750490634814572,
983
+ "loss": 0.5618096590042114,
984
+ "num_input_tokens_seen": 314592,
985
+ "step": 95,
986
+ "train_runtime": 616.8898,
987
+ "train_tokens_per_second": 509.965
988
+ },
989
+ {
990
+ "epoch": 0.7741935483870968,
991
+ "grad_norm": 0.5666610598564148,
992
+ "learning_rate": 0.00019744812126155245,
993
+ "loss": 0.5454350709915161,
994
+ "num_input_tokens_seen": 317960,
995
+ "step": 96,
996
+ "train_runtime": 622.062,
997
+ "train_tokens_per_second": 511.139
998
+ },
999
+ {
1000
+ "epoch": 0.782258064516129,
1001
+ "grad_norm": 0.5417006611824036,
1002
+ "learning_rate": 0.00019739070559659347,
1003
+ "loss": 0.561352014541626,
1004
+ "num_input_tokens_seen": 321274,
1005
+ "step": 97,
1006
+ "train_runtime": 627.2236,
1007
+ "train_tokens_per_second": 512.216
1008
+ },
1009
+ {
1010
+ "epoch": 0.7903225806451613,
1011
+ "grad_norm": 0.5439388751983643,
1012
+ "learning_rate": 0.0001973326597248006,
1013
+ "loss": 0.5779426097869873,
1014
+ "num_input_tokens_seen": 324480,
1015
+ "step": 98,
1016
+ "train_runtime": 632.176,
1017
+ "train_tokens_per_second": 513.275
1018
+ },
1019
+ {
1020
+ "epoch": 0.7983870967741935,
1021
+ "grad_norm": 0.5616183280944824,
1022
+ "learning_rate": 0.0001972739840217836,
1023
+ "loss": 0.49771565198898315,
1024
+ "num_input_tokens_seen": 327678,
1025
+ "step": 99,
1026
+ "train_runtime": 636.8674,
1027
+ "train_tokens_per_second": 514.515
1028
+ },
1029
+ {
1030
+ "epoch": 0.8064516129032258,
1031
+ "grad_norm": 0.8428842425346375,
1032
+ "learning_rate": 0.00019721467886722792,
1033
+ "loss": 0.5603002309799194,
1034
+ "num_input_tokens_seen": 330914,
1035
+ "step": 100,
1036
+ "train_runtime": 641.6893,
1037
+ "train_tokens_per_second": 515.692
1038
+ },
1039
+ {
1040
+ "epoch": 0.8145161290322581,
1041
+ "grad_norm": 0.6361693143844604,
1042
+ "learning_rate": 0.00019715474464489208,
1043
+ "loss": 0.5690774321556091,
1044
+ "num_input_tokens_seen": 334256,
1045
+ "step": 101,
1046
+ "train_runtime": 646.5226,
1047
+ "train_tokens_per_second": 517.006
1048
+ },
1049
+ {
1050
+ "epoch": 0.8225806451612904,
1051
+ "grad_norm": 0.5664668083190918,
1052
+ "learning_rate": 0.0001970941817426052,
1053
+ "loss": 0.5249347686767578,
1054
+ "num_input_tokens_seen": 337722,
1055
+ "step": 102,
1056
+ "train_runtime": 651.2588,
1057
+ "train_tokens_per_second": 518.568
1058
+ },
1059
+ {
1060
+ "epoch": 0.8306451612903226,
1061
+ "grad_norm": 0.564401388168335,
1062
+ "learning_rate": 0.00019703299055226468,
1063
+ "loss": 0.5265264511108398,
1064
+ "num_input_tokens_seen": 341038,
1065
+ "step": 103,
1066
+ "train_runtime": 656.0275,
1067
+ "train_tokens_per_second": 519.853
1068
+ },
1069
+ {
1070
+ "epoch": 0.8387096774193549,
1071
+ "grad_norm": 0.7196506857872009,
1072
+ "learning_rate": 0.00019697117146983334,
1073
+ "loss": 0.5534394979476929,
1074
+ "num_input_tokens_seen": 344066,
1075
+ "step": 104,
1076
+ "train_runtime": 660.2072,
1077
+ "train_tokens_per_second": 521.148
1078
+ },
1079
+ {
1080
+ "epoch": 0.8467741935483871,
1081
+ "grad_norm": 0.579501211643219,
1082
+ "learning_rate": 0.0001969087248953371,
1083
+ "loss": 0.5940977334976196,
1084
+ "num_input_tokens_seen": 347352,
1085
+ "step": 105,
1086
+ "train_runtime": 664.7483,
1087
+ "train_tokens_per_second": 522.532
1088
+ },
1089
+ {
1090
+ "epoch": 0.8548387096774194,
1091
+ "grad_norm": 0.5122610330581665,
1092
+ "learning_rate": 0.00019684565123286244,
1093
+ "loss": 0.5310789942741394,
1094
+ "num_input_tokens_seen": 350654,
1095
+ "step": 106,
1096
+ "train_runtime": 669.4554,
1097
+ "train_tokens_per_second": 523.79
1098
+ },
1099
+ {
1100
+ "epoch": 0.8629032258064516,
1101
+ "grad_norm": 0.5480422973632812,
1102
+ "learning_rate": 0.00019678195089055346,
1103
+ "loss": 0.5730823874473572,
1104
+ "num_input_tokens_seen": 353954,
1105
+ "step": 107,
1106
+ "train_runtime": 673.9271,
1107
+ "train_tokens_per_second": 525.211
1108
+ },
1109
+ {
1110
+ "epoch": 0.8709677419354839,
1111
+ "grad_norm": 0.5518767237663269,
1112
+ "learning_rate": 0.00019671762428060966,
1113
+ "loss": 0.553552508354187,
1114
+ "num_input_tokens_seen": 357348,
1115
+ "step": 108,
1116
+ "train_runtime": 678.7199,
1117
+ "train_tokens_per_second": 526.503
1118
+ },
1119
+ {
1120
+ "epoch": 0.8790322580645161,
1121
+ "grad_norm": 0.5668540596961975,
1122
+ "learning_rate": 0.00019665267181928292,
1123
+ "loss": 0.5294761657714844,
1124
+ "num_input_tokens_seen": 360628,
1125
+ "step": 109,
1126
+ "train_runtime": 683.35,
1127
+ "train_tokens_per_second": 527.735
1128
+ },
1129
+ {
1130
+ "epoch": 0.8870967741935484,
1131
+ "grad_norm": 0.4758228659629822,
1132
+ "learning_rate": 0.00019658709392687506,
1133
+ "loss": 0.5213898420333862,
1134
+ "num_input_tokens_seen": 364030,
1135
+ "step": 110,
1136
+ "train_runtime": 688.0325,
1137
+ "train_tokens_per_second": 529.088
1138
+ },
1139
+ {
1140
+ "epoch": 0.8951612903225806,
1141
+ "grad_norm": 0.604768693447113,
1142
+ "learning_rate": 0.00019652089102773488,
1143
+ "loss": 0.6025131940841675,
1144
+ "num_input_tokens_seen": 367360,
1145
+ "step": 111,
1146
+ "train_runtime": 692.8246,
1147
+ "train_tokens_per_second": 530.235
1148
+ },
1149
+ {
1150
+ "epoch": 0.9032258064516129,
1151
+ "grad_norm": 0.5635347366333008,
1152
+ "learning_rate": 0.00019645406355025565,
1153
+ "loss": 0.5061513185501099,
1154
+ "num_input_tokens_seen": 370518,
1155
+ "step": 112,
1156
+ "train_runtime": 697.3954,
1157
+ "train_tokens_per_second": 531.288
1158
+ },
1159
+ {
1160
+ "epoch": 0.9112903225806451,
1161
+ "grad_norm": 0.5575589537620544,
1162
+ "learning_rate": 0.00019638661192687216,
1163
+ "loss": 0.5043578147888184,
1164
+ "num_input_tokens_seen": 373828,
1165
+ "step": 113,
1166
+ "train_runtime": 702.2518,
1167
+ "train_tokens_per_second": 532.328
1168
+ },
1169
+ {
1170
+ "epoch": 0.9193548387096774,
1171
+ "grad_norm": 0.5947410464286804,
1172
+ "learning_rate": 0.00019631853659405807,
1173
+ "loss": 0.5288162231445312,
1174
+ "num_input_tokens_seen": 377226,
1175
+ "step": 114,
1176
+ "train_runtime": 707.3502,
1177
+ "train_tokens_per_second": 533.295
1178
+ },
1179
+ {
1180
+ "epoch": 0.9274193548387096,
1181
+ "grad_norm": 0.5738222599029541,
1182
+ "learning_rate": 0.000196249837992323,
1183
+ "loss": 0.5216498374938965,
1184
+ "num_input_tokens_seen": 380624,
1185
+ "step": 115,
1186
+ "train_runtime": 712.258,
1187
+ "train_tokens_per_second": 534.391
1188
+ },
1189
+ {
1190
+ "epoch": 0.9354838709677419,
1191
+ "grad_norm": 0.5514879822731018,
1192
+ "learning_rate": 0.0001961805165662096,
1193
+ "loss": 0.47155335545539856,
1194
+ "num_input_tokens_seen": 384012,
1195
+ "step": 116,
1196
+ "train_runtime": 717.2598,
1197
+ "train_tokens_per_second": 535.388
1198
+ },
1199
+ {
1200
+ "epoch": 0.9435483870967742,
1201
+ "grad_norm": 0.6073253154754639,
1202
+ "learning_rate": 0.00019611057276429085,
1203
+ "loss": 0.5124592781066895,
1204
+ "num_input_tokens_seen": 387364,
1205
+ "step": 117,
1206
+ "train_runtime": 722.0631,
1207
+ "train_tokens_per_second": 536.468
1208
+ },
1209
+ {
1210
+ "epoch": 0.9516129032258065,
1211
+ "grad_norm": 0.5801448822021484,
1212
+ "learning_rate": 0.00019604000703916705,
1213
+ "loss": 0.5122741460800171,
1214
+ "num_input_tokens_seen": 390740,
1215
+ "step": 118,
1216
+ "train_runtime": 726.8929,
1217
+ "train_tokens_per_second": 537.548
1218
+ },
1219
+ {
1220
+ "epoch": 0.9596774193548387,
1221
+ "grad_norm": 0.5871880650520325,
1222
+ "learning_rate": 0.00019596881984746287,
1223
+ "loss": 0.5125201940536499,
1224
+ "num_input_tokens_seen": 394002,
1225
+ "step": 119,
1226
+ "train_runtime": 731.6556,
1227
+ "train_tokens_per_second": 538.507
1228
+ },
1229
+ {
1230
+ "epoch": 0.967741935483871,
1231
+ "grad_norm": 0.6655816435813904,
1232
+ "learning_rate": 0.00019589701164982452,
1233
+ "loss": 0.524815022945404,
1234
+ "num_input_tokens_seen": 397036,
1235
+ "step": 120,
1236
+ "train_runtime": 735.9727,
1237
+ "train_tokens_per_second": 539.471
1238
+ },
1239
+ {
1240
+ "epoch": 0.967741935483871,
1241
+ "eval_loss": 2.589766025543213,
1242
+ "eval_runtime": 12.1494,
1243
+ "eval_samples_per_second": 4.28,
1244
+ "eval_steps_per_second": 2.14,
1245
+ "num_input_tokens_seen": 397036,
1246
+ "step": 120
1247
+ },
1248
+ {
1249
+ "epoch": 0.9758064516129032,
1250
+ "grad_norm": 0.6723934412002563,
1251
+ "learning_rate": 0.00019582458291091663,
1252
+ "loss": 0.611114501953125,
1253
+ "num_input_tokens_seen": 400216,
1254
+ "step": 121,
1255
+ "train_runtime": 772.7017,
1256
+ "train_tokens_per_second": 517.944
1257
+ },
1258
+ {
1259
+ "epoch": 0.9838709677419355,
1260
+ "grad_norm": 0.5538724064826965,
1261
+ "learning_rate": 0.0001957515340994193,
1262
+ "loss": 0.5213549137115479,
1263
+ "num_input_tokens_seen": 403522,
1264
+ "step": 122,
1265
+ "train_runtime": 777.3527,
1266
+ "train_tokens_per_second": 519.098
1267
+ },
1268
+ {
1269
+ "epoch": 0.9919354838709677,
1270
+ "grad_norm": 0.5994356274604797,
1271
+ "learning_rate": 0.000195677865688025,
1272
+ "loss": 0.5735296607017517,
1273
+ "num_input_tokens_seen": 406874,
1274
+ "step": 123,
1275
+ "train_runtime": 782.2491,
1276
+ "train_tokens_per_second": 520.134
1277
+ },
1278
+ {
1279
+ "epoch": 1.0,
1280
+ "grad_norm": 0.6571742296218872,
1281
+ "learning_rate": 0.00019560357815343577,
1282
+ "loss": 0.5385057926177979,
1283
+ "num_input_tokens_seen": 410114,
1284
+ "step": 124,
1285
+ "train_runtime": 787.0107,
1286
+ "train_tokens_per_second": 521.103
1287
+ },
1288
+ {
1289
+ "epoch": 1.0080645161290323,
1290
+ "grad_norm": 0.5309604406356812,
1291
+ "learning_rate": 0.00019552867197635974,
1292
+ "loss": 0.3922950029373169,
1293
+ "num_input_tokens_seen": 413264,
1294
+ "step": 125,
1295
+ "train_runtime": 791.8414,
1296
+ "train_tokens_per_second": 521.902
1297
+ },
1298
+ {
1299
+ "epoch": 1.0161290322580645,
1300
+ "grad_norm": 0.4756343960762024,
1301
+ "learning_rate": 0.00019545314764150837,
1302
+ "loss": 0.3772149085998535,
1303
+ "num_input_tokens_seen": 416432,
1304
+ "step": 126,
1305
+ "train_runtime": 796.7161,
1306
+ "train_tokens_per_second": 522.686
1307
+ },
1308
+ {
1309
+ "epoch": 1.0241935483870968,
1310
+ "grad_norm": 0.5248489379882812,
1311
+ "learning_rate": 0.00019537700563759304,
1312
+ "loss": 0.4322376251220703,
1313
+ "num_input_tokens_seen": 419814,
1314
+ "step": 127,
1315
+ "train_runtime": 801.9801,
1316
+ "train_tokens_per_second": 523.472
1317
+ },
1318
+ {
1319
+ "epoch": 1.032258064516129,
1320
+ "grad_norm": 0.5486618280410767,
1321
+ "learning_rate": 0.00019530024645732206,
1322
+ "loss": 0.42461395263671875,
1323
+ "num_input_tokens_seen": 423168,
1324
+ "step": 128,
1325
+ "train_runtime": 807.1128,
1326
+ "train_tokens_per_second": 524.298
1327
+ },
1328
+ {
1329
+ "epoch": 1.0403225806451613,
1330
+ "grad_norm": 0.6822044253349304,
1331
+ "learning_rate": 0.00019522287059739753,
1332
+ "loss": 0.44680896401405334,
1333
+ "num_input_tokens_seen": 426248,
1334
+ "step": 129,
1335
+ "train_runtime": 811.7053,
1336
+ "train_tokens_per_second": 525.127
1337
+ },
1338
+ {
1339
+ "epoch": 1.0483870967741935,
1340
+ "grad_norm": 0.6571690440177917,
1341
+ "learning_rate": 0.00019514487855851184,
1342
+ "loss": 0.45198315382003784,
1343
+ "num_input_tokens_seen": 429726,
1344
+ "step": 130,
1345
+ "train_runtime": 816.7684,
1346
+ "train_tokens_per_second": 526.13
1347
+ },
1348
+ {
1349
+ "epoch": 1.0564516129032258,
1350
+ "grad_norm": 0.6491777896881104,
1351
+ "learning_rate": 0.00019506627084534483,
1352
+ "loss": 0.4377821683883667,
1353
+ "num_input_tokens_seen": 432912,
1354
+ "step": 131,
1355
+ "train_runtime": 821.4589,
1356
+ "train_tokens_per_second": 527.004
1357
+ },
1358
+ {
1359
+ "epoch": 1.064516129032258,
1360
+ "grad_norm": 0.6005299091339111,
1361
+ "learning_rate": 0.00019498704796656018,
1362
+ "loss": 0.45241302251815796,
1363
+ "num_input_tokens_seen": 436246,
1364
+ "step": 132,
1365
+ "train_runtime": 826.1551,
1366
+ "train_tokens_per_second": 528.044
1367
+ },
1368
+ {
1369
+ "epoch": 1.0725806451612903,
1370
+ "grad_norm": 0.5644078850746155,
1371
+ "learning_rate": 0.00019490721043480226,
1372
+ "loss": 0.4442741274833679,
1373
+ "num_input_tokens_seen": 439446,
1374
+ "step": 133,
1375
+ "train_runtime": 830.7581,
1376
+ "train_tokens_per_second": 528.97
1377
+ },
1378
+ {
1379
+ "epoch": 1.0806451612903225,
1380
+ "grad_norm": 0.5728560090065002,
1381
+ "learning_rate": 0.00019482675876669286,
1382
+ "loss": 0.4184889793395996,
1383
+ "num_input_tokens_seen": 442716,
1384
+ "step": 134,
1385
+ "train_runtime": 835.3283,
1386
+ "train_tokens_per_second": 529.99
1387
+ },
1388
+ {
1389
+ "epoch": 1.0887096774193548,
1390
+ "grad_norm": 0.5105488896369934,
1391
+ "learning_rate": 0.00019474569348282774,
1392
+ "loss": 0.4137258529663086,
1393
+ "num_input_tokens_seen": 445948,
1394
+ "step": 135,
1395
+ "train_runtime": 839.8381,
1396
+ "train_tokens_per_second": 530.993
1397
+ },
1398
+ {
1399
+ "epoch": 1.096774193548387,
1400
+ "grad_norm": 0.6140270829200745,
1401
+ "learning_rate": 0.0001946640151077734,
1402
+ "loss": 0.4656698703765869,
1403
+ "num_input_tokens_seen": 449172,
1404
+ "step": 136,
1405
+ "train_runtime": 844.4082,
1406
+ "train_tokens_per_second": 531.937
1407
+ },
1408
+ {
1409
+ "epoch": 1.1048387096774193,
1410
+ "grad_norm": 0.6442403793334961,
1411
+ "learning_rate": 0.00019458172417006347,
1412
+ "loss": 0.519420325756073,
1413
+ "num_input_tokens_seen": 452512,
1414
+ "step": 137,
1415
+ "train_runtime": 848.9974,
1416
+ "train_tokens_per_second": 532.996
1417
+ },
1418
+ {
1419
+ "epoch": 1.1129032258064515,
1420
+ "grad_norm": 0.5361612439155579,
1421
+ "learning_rate": 0.00019449882120219555,
1422
+ "loss": 0.4331057667732239,
1423
+ "num_input_tokens_seen": 455922,
1424
+ "step": 138,
1425
+ "train_runtime": 853.7922,
1426
+ "train_tokens_per_second": 533.996
1427
+ },
1428
+ {
1429
+ "epoch": 1.120967741935484,
1430
+ "grad_norm": 0.56069415807724,
1431
+ "learning_rate": 0.00019441530674062753,
1432
+ "loss": 0.43306776881217957,
1433
+ "num_input_tokens_seen": 459246,
1434
+ "step": 139,
1435
+ "train_runtime": 858.4641,
1436
+ "train_tokens_per_second": 534.962
1437
+ },
1438
+ {
1439
+ "epoch": 1.129032258064516,
1440
+ "grad_norm": 0.5961108803749084,
1441
+ "learning_rate": 0.0001943311813257743,
1442
+ "loss": 0.38593000173568726,
1443
+ "num_input_tokens_seen": 462644,
1444
+ "step": 140,
1445
+ "train_runtime": 863.174,
1446
+ "train_tokens_per_second": 535.98
1447
+ },
1448
+ {
1449
+ "epoch": 1.1370967741935485,
1450
+ "grad_norm": 0.5356931090354919,
1451
+ "learning_rate": 0.00019424644550200415,
1452
+ "loss": 0.45368602871894836,
1453
+ "num_input_tokens_seen": 466004,
1454
+ "step": 141,
1455
+ "train_runtime": 867.9995,
1456
+ "train_tokens_per_second": 536.871
1457
+ },
1458
+ {
1459
+ "epoch": 1.1451612903225807,
1460
+ "grad_norm": 0.6337073445320129,
1461
+ "learning_rate": 0.00019416109981763526,
1462
+ "loss": 0.4836081266403198,
1463
+ "num_input_tokens_seen": 469474,
1464
+ "step": 142,
1465
+ "train_runtime": 872.8737,
1466
+ "train_tokens_per_second": 537.849
1467
+ },
1468
+ {
1469
+ "epoch": 1.153225806451613,
1470
+ "grad_norm": 0.5758848786354065,
1471
+ "learning_rate": 0.00019407514482493214,
1472
+ "loss": 0.39848586916923523,
1473
+ "num_input_tokens_seen": 472830,
1474
+ "step": 143,
1475
+ "train_runtime": 877.6996,
1476
+ "train_tokens_per_second": 538.715
1477
+ },
1478
+ {
1479
+ "epoch": 1.1612903225806452,
1480
+ "grad_norm": 0.580061674118042,
1481
+ "learning_rate": 0.00019398858108010217,
1482
+ "loss": 0.4755861461162567,
1483
+ "num_input_tokens_seen": 476146,
1484
+ "step": 144,
1485
+ "train_runtime": 882.5729,
1486
+ "train_tokens_per_second": 539.498
1487
+ },
1488
+ {
1489
+ "epoch": 1.1693548387096775,
1490
+ "grad_norm": 0.6298698782920837,
1491
+ "learning_rate": 0.0001939014091432918,
1492
+ "loss": 0.48262229561805725,
1493
+ "num_input_tokens_seen": 479388,
1494
+ "step": 145,
1495
+ "train_runtime": 887.2589,
1496
+ "train_tokens_per_second": 540.302
1497
+ },
1498
+ {
1499
+ "epoch": 1.1774193548387097,
1500
+ "grad_norm": 0.5250868201255798,
1501
+ "learning_rate": 0.00019381362957858312,
1502
+ "loss": 0.46615517139434814,
1503
+ "num_input_tokens_seen": 482820,
1504
+ "step": 146,
1505
+ "train_runtime": 892.2828,
1506
+ "train_tokens_per_second": 541.106
1507
+ },
1508
+ {
1509
+ "epoch": 1.185483870967742,
1510
+ "grad_norm": 0.5974312424659729,
1511
+ "learning_rate": 0.00019372524295399013,
1512
+ "loss": 0.4052748680114746,
1513
+ "num_input_tokens_seen": 485956,
1514
+ "step": 147,
1515
+ "train_runtime": 896.8248,
1516
+ "train_tokens_per_second": 541.863
1517
+ },
1518
+ {
1519
+ "epoch": 1.1935483870967742,
1520
+ "grad_norm": 0.4935814440250397,
1521
+ "learning_rate": 0.00019363624984145502,
1522
+ "loss": 0.4510989189147949,
1523
+ "num_input_tokens_seen": 489266,
1524
+ "step": 148,
1525
+ "train_runtime": 901.6055,
1526
+ "train_tokens_per_second": 542.661
1527
+ },
1528
+ {
1529
+ "epoch": 1.2016129032258065,
1530
+ "grad_norm": 0.5875913500785828,
1531
+ "learning_rate": 0.00019354665081684446,
1532
+ "loss": 0.39551085233688354,
1533
+ "num_input_tokens_seen": 492712,
1534
+ "step": 149,
1535
+ "train_runtime": 906.6105,
1536
+ "train_tokens_per_second": 543.466
1537
+ },
1538
+ {
1539
+ "epoch": 1.2096774193548387,
1540
+ "grad_norm": 0.5005422234535217,
1541
+ "learning_rate": 0.0001934564464599461,
1542
+ "loss": 0.37073755264282227,
1543
+ "num_input_tokens_seen": 496182,
1544
+ "step": 150,
1545
+ "train_runtime": 911.4473,
1546
+ "train_tokens_per_second": 544.389
1547
+ },
1548
+ {
1549
+ "epoch": 1.2096774193548387,
1550
+ "eval_loss": 2.5809271335601807,
1551
+ "eval_runtime": 12.2545,
1552
+ "eval_samples_per_second": 4.243,
1553
+ "eval_steps_per_second": 2.122,
1554
+ "num_input_tokens_seen": 496182,
1555
+ "step": 150
1556
+ },
1557
+ {
1558
+ "epoch": 1.217741935483871,
1559
+ "grad_norm": 0.6077068448066711,
1560
+ "learning_rate": 0.00019336563735446446,
1561
+ "loss": 0.46918922662734985,
1562
+ "num_input_tokens_seen": 499482,
1563
+ "step": 151,
1564
+ "train_runtime": 946.9561,
1565
+ "train_tokens_per_second": 527.461
1566
+ },
1567
+ {
1568
+ "epoch": 1.2258064516129032,
1569
+ "grad_norm": 0.5957709550857544,
1570
+ "learning_rate": 0.00019327422408801744,
1571
+ "loss": 0.43228402733802795,
1572
+ "num_input_tokens_seen": 502832,
1573
+ "step": 152,
1574
+ "train_runtime": 951.617,
1575
+ "train_tokens_per_second": 528.397
1576
+ },
1577
+ {
1578
+ "epoch": 1.2338709677419355,
1579
+ "grad_norm": 0.5242084264755249,
1580
+ "learning_rate": 0.0001931822072521323,
1581
+ "loss": 0.40411657094955444,
1582
+ "num_input_tokens_seen": 506196,
1583
+ "step": 153,
1584
+ "train_runtime": 956.5068,
1585
+ "train_tokens_per_second": 529.213
1586
+ },
1587
+ {
1588
+ "epoch": 1.2419354838709677,
1589
+ "grad_norm": 0.6395658254623413,
1590
+ "learning_rate": 0.00019308958744224217,
1591
+ "loss": 0.41684025526046753,
1592
+ "num_input_tokens_seen": 509348,
1593
+ "step": 154,
1594
+ "train_runtime": 961.0623,
1595
+ "train_tokens_per_second": 529.984
1596
+ },
1597
+ {
1598
+ "epoch": 1.25,
1599
+ "grad_norm": 0.5764552354812622,
1600
+ "learning_rate": 0.00019299636525768173,
1601
+ "loss": 0.42104196548461914,
1602
+ "num_input_tokens_seen": 512728,
1603
+ "step": 155,
1604
+ "train_runtime": 966.1691,
1605
+ "train_tokens_per_second": 530.681
1606
+ },
1607
+ {
1608
+ "epoch": 1.2580645161290323,
1609
+ "grad_norm": 0.6039356589317322,
1610
+ "learning_rate": 0.00019290254130168374,
1611
+ "loss": 0.41675058007240295,
1612
+ "num_input_tokens_seen": 516078,
1613
+ "step": 156,
1614
+ "train_runtime": 971.2647,
1615
+ "train_tokens_per_second": 531.346
1616
+ },
1617
+ {
1618
+ "epoch": 1.2661290322580645,
1619
+ "grad_norm": 0.5306972861289978,
1620
+ "learning_rate": 0.00019280811618137484,
1621
+ "loss": 0.44224196672439575,
1622
+ "num_input_tokens_seen": 519482,
1623
+ "step": 157,
1624
+ "train_runtime": 976.5865,
1625
+ "train_tokens_per_second": 531.936
1626
+ },
1627
+ {
1628
+ "epoch": 1.2741935483870968,
1629
+ "grad_norm": 0.5725175142288208,
1630
+ "learning_rate": 0.00019271309050777183,
1631
+ "loss": 0.4407888650894165,
1632
+ "num_input_tokens_seen": 522758,
1633
+ "step": 158,
1634
+ "train_runtime": 981.664,
1635
+ "train_tokens_per_second": 532.522
1636
+ },
1637
+ {
1638
+ "epoch": 1.282258064516129,
1639
+ "grad_norm": 0.5713056921958923,
1640
+ "learning_rate": 0.00019261746489577765,
1641
+ "loss": 0.4012797474861145,
1642
+ "num_input_tokens_seen": 525784,
1643
+ "step": 159,
1644
+ "train_runtime": 986.2122,
1645
+ "train_tokens_per_second": 533.135
1646
+ },
1647
+ {
1648
+ "epoch": 1.2903225806451613,
1649
+ "grad_norm": 0.4923282265663147,
1650
+ "learning_rate": 0.00019252123996417738,
1651
+ "loss": 0.3848939836025238,
1652
+ "num_input_tokens_seen": 529130,
1653
+ "step": 160,
1654
+ "train_runtime": 991.1125,
1655
+ "train_tokens_per_second": 533.875
1656
+ },
1657
+ {
1658
+ "epoch": 1.2983870967741935,
1659
+ "grad_norm": 0.5265042185783386,
1660
+ "learning_rate": 0.00019242441633563417,
1661
+ "loss": 0.4451833665370941,
1662
+ "num_input_tokens_seen": 532308,
1663
+ "step": 161,
1664
+ "train_runtime": 995.6839,
1665
+ "train_tokens_per_second": 534.615
1666
+ },
1667
+ {
1668
+ "epoch": 1.3064516129032258,
1669
+ "grad_norm": 0.499607652425766,
1670
+ "learning_rate": 0.00019232699463668542,
1671
+ "loss": 0.37897300720214844,
1672
+ "num_input_tokens_seen": 535662,
1673
+ "step": 162,
1674
+ "train_runtime": 1000.3675,
1675
+ "train_tokens_per_second": 535.465
1676
+ },
1677
+ {
1678
+ "epoch": 1.314516129032258,
1679
+ "grad_norm": 0.6007817983627319,
1680
+ "learning_rate": 0.00019222897549773848,
1681
+ "loss": 0.40716710686683655,
1682
+ "num_input_tokens_seen": 539056,
1683
+ "step": 163,
1684
+ "train_runtime": 1005.1457,
1685
+ "train_tokens_per_second": 536.296
1686
+ },
1687
+ {
1688
+ "epoch": 1.3225806451612903,
1689
+ "grad_norm": 0.5382058620452881,
1690
+ "learning_rate": 0.0001921303595530667,
1691
+ "loss": 0.3810466527938843,
1692
+ "num_input_tokens_seen": 542450,
1693
+ "step": 164,
1694
+ "train_runtime": 1009.817,
1695
+ "train_tokens_per_second": 537.177
1696
+ },
1697
+ {
1698
+ "epoch": 1.3306451612903225,
1699
+ "grad_norm": 0.7238770127296448,
1700
+ "learning_rate": 0.00019203114744080542,
1701
+ "loss": 0.48473918437957764,
1702
+ "num_input_tokens_seen": 545796,
1703
+ "step": 165,
1704
+ "train_runtime": 1014.3995,
1705
+ "train_tokens_per_second": 538.048
1706
+ },
1707
+ {
1708
+ "epoch": 1.3387096774193548,
1709
+ "grad_norm": 0.6477717757225037,
1710
+ "learning_rate": 0.0001919313398029475,
1711
+ "loss": 0.44077277183532715,
1712
+ "num_input_tokens_seen": 548960,
1713
+ "step": 166,
1714
+ "train_runtime": 1018.9076,
1715
+ "train_tokens_per_second": 538.773
1716
+ },
1717
+ {
1718
+ "epoch": 1.346774193548387,
1719
+ "grad_norm": 0.5776754021644592,
1720
+ "learning_rate": 0.00019183093728533966,
1721
+ "loss": 0.428521066904068,
1722
+ "num_input_tokens_seen": 552310,
1723
+ "step": 167,
1724
+ "train_runtime": 1023.4907,
1725
+ "train_tokens_per_second": 539.634
1726
+ },
1727
+ {
1728
+ "epoch": 1.3548387096774195,
1729
+ "grad_norm": 0.589496910572052,
1730
+ "learning_rate": 0.00019172994053767784,
1731
+ "loss": 0.44079262018203735,
1732
+ "num_input_tokens_seen": 555660,
1733
+ "step": 168,
1734
+ "train_runtime": 1028.1528,
1735
+ "train_tokens_per_second": 540.445
1736
+ },
1737
+ {
1738
+ "epoch": 1.3629032258064515,
1739
+ "grad_norm": 0.5510254502296448,
1740
+ "learning_rate": 0.0001916283502135033,
1741
+ "loss": 0.4306449592113495,
1742
+ "num_input_tokens_seen": 558942,
1743
+ "step": 169,
1744
+ "train_runtime": 1032.8701,
1745
+ "train_tokens_per_second": 541.154
1746
+ },
1747
+ {
1748
+ "epoch": 1.370967741935484,
1749
+ "grad_norm": 0.6984896063804626,
1750
+ "learning_rate": 0.00019152616697019822,
1751
+ "loss": 0.4740030765533447,
1752
+ "num_input_tokens_seen": 561914,
1753
+ "step": 170,
1754
+ "train_runtime": 1037.1046,
1755
+ "train_tokens_per_second": 541.81
1756
+ },
1757
+ {
1758
+ "epoch": 1.379032258064516,
1759
+ "grad_norm": 0.5186349153518677,
1760
+ "learning_rate": 0.0001914233914689815,
1761
+ "loss": 0.46226799488067627,
1762
+ "num_input_tokens_seen": 565128,
1763
+ "step": 171,
1764
+ "train_runtime": 1041.8057,
1765
+ "train_tokens_per_second": 542.45
1766
+ },
1767
+ {
1768
+ "epoch": 1.3870967741935485,
1769
+ "grad_norm": 0.6191644668579102,
1770
+ "learning_rate": 0.00019132002437490458,
1771
+ "loss": 0.4157659709453583,
1772
+ "num_input_tokens_seen": 568492,
1773
+ "step": 172,
1774
+ "train_runtime": 1046.6025,
1775
+ "train_tokens_per_second": 543.179
1776
+ },
1777
+ {
1778
+ "epoch": 1.3951612903225805,
1779
+ "grad_norm": 0.5917848944664001,
1780
+ "learning_rate": 0.00019121606635684696,
1781
+ "loss": 0.4816833436489105,
1782
+ "num_input_tokens_seen": 571838,
1783
+ "step": 173,
1784
+ "train_runtime": 1051.4773,
1785
+ "train_tokens_per_second": 543.842
1786
+ },
1787
+ {
1788
+ "epoch": 1.403225806451613,
1789
+ "grad_norm": 0.4845684766769409,
1790
+ "learning_rate": 0.00019111151808751196,
1791
+ "loss": 0.3603062629699707,
1792
+ "num_input_tokens_seen": 575172,
1793
+ "step": 174,
1794
+ "train_runtime": 1056.4402,
1795
+ "train_tokens_per_second": 544.444
1796
+ },
1797
+ {
1798
+ "epoch": 1.4112903225806452,
1799
+ "grad_norm": 0.6082286238670349,
1800
+ "learning_rate": 0.00019100638024342244,
1801
+ "loss": 0.4300457835197449,
1802
+ "num_input_tokens_seen": 578472,
1803
+ "step": 175,
1804
+ "train_runtime": 1061.2665,
1805
+ "train_tokens_per_second": 545.077
1806
+ },
1807
+ {
1808
+ "epoch": 1.4193548387096775,
1809
+ "grad_norm": 0.5522608160972595,
1810
+ "learning_rate": 0.00019090065350491626,
1811
+ "loss": 0.41547319293022156,
1812
+ "num_input_tokens_seen": 581854,
1813
+ "step": 176,
1814
+ "train_runtime": 1066.3155,
1815
+ "train_tokens_per_second": 545.668
1816
+ },
1817
+ {
1818
+ "epoch": 1.4274193548387097,
1819
+ "grad_norm": 0.553947925567627,
1820
+ "learning_rate": 0.00019079433855614201,
1821
+ "loss": 0.4243569076061249,
1822
+ "num_input_tokens_seen": 585240,
1823
+ "step": 177,
1824
+ "train_runtime": 1071.1687,
1825
+ "train_tokens_per_second": 546.356
1826
+ },
1827
+ {
1828
+ "epoch": 1.435483870967742,
1829
+ "grad_norm": 0.7811908721923828,
1830
+ "learning_rate": 0.00019068743608505455,
1831
+ "loss": 0.46935024857521057,
1832
+ "num_input_tokens_seen": 588466,
1833
+ "step": 178,
1834
+ "train_runtime": 1075.7542,
1835
+ "train_tokens_per_second": 547.026
1836
+ },
1837
+ {
1838
+ "epoch": 1.4435483870967742,
1839
+ "grad_norm": 0.6735747456550598,
1840
+ "learning_rate": 0.0001905799467834105,
1841
+ "loss": 0.4160788357257843,
1842
+ "num_input_tokens_seen": 591716,
1843
+ "step": 179,
1844
+ "train_runtime": 1080.4651,
1845
+ "train_tokens_per_second": 547.649
1846
+ },
1847
+ {
1848
+ "epoch": 1.4516129032258065,
1849
+ "grad_norm": 0.563190221786499,
1850
+ "learning_rate": 0.00019047187134676387,
1851
+ "loss": 0.4146038293838501,
1852
+ "num_input_tokens_seen": 595124,
1853
+ "step": 180,
1854
+ "train_runtime": 1085.2463,
1855
+ "train_tokens_per_second": 548.377
1856
+ },
1857
+ {
1858
+ "epoch": 1.4516129032258065,
1859
+ "eval_loss": 2.515367269515991,
1860
+ "eval_runtime": 12.2457,
1861
+ "eval_samples_per_second": 4.246,
1862
+ "eval_steps_per_second": 2.123,
1863
+ "num_input_tokens_seen": 595124,
1864
+ "step": 180
1865
+ },
1866
+ {
1867
+ "epoch": 1.4596774193548387,
1868
+ "grad_norm": 0.5683964490890503,
1869
+ "learning_rate": 0.0001903632104744614,
1870
+ "loss": 0.4067952334880829,
1871
+ "num_input_tokens_seen": 598560,
1872
+ "step": 181,
1873
+ "train_runtime": 1121.3958,
1874
+ "train_tokens_per_second": 533.763
1875
+ },
1876
+ {
1877
+ "epoch": 1.467741935483871,
1878
+ "grad_norm": 0.7010737061500549,
1879
+ "learning_rate": 0.00019025396486963827,
1880
+ "loss": 0.47173696756362915,
1881
+ "num_input_tokens_seen": 601832,
1882
+ "step": 182,
1883
+ "train_runtime": 1125.8583,
1884
+ "train_tokens_per_second": 534.554
1885
+ },
1886
+ {
1887
+ "epoch": 1.4758064516129032,
1888
+ "grad_norm": 0.5702352523803711,
1889
+ "learning_rate": 0.0001901441352392133,
1890
+ "loss": 0.45567232370376587,
1891
+ "num_input_tokens_seen": 605124,
1892
+ "step": 183,
1893
+ "train_runtime": 1130.613,
1894
+ "train_tokens_per_second": 535.218
1895
+ },
1896
+ {
1897
+ "epoch": 1.4838709677419355,
1898
+ "grad_norm": 0.5321019291877747,
1899
+ "learning_rate": 0.00019003372229388452,
1900
+ "loss": 0.44429701566696167,
1901
+ "num_input_tokens_seen": 608550,
1902
+ "step": 184,
1903
+ "train_runtime": 1135.4582,
1904
+ "train_tokens_per_second": 535.951
1905
+ },
1906
+ {
1907
+ "epoch": 1.4919354838709677,
1908
+ "grad_norm": 0.5422691106796265,
1909
+ "learning_rate": 0.0001899227267481246,
1910
+ "loss": 0.4688451588153839,
1911
+ "num_input_tokens_seen": 611912,
1912
+ "step": 185,
1913
+ "train_runtime": 1140.4185,
1914
+ "train_tokens_per_second": 536.568
1915
+ },
1916
+ {
1917
+ "epoch": 1.5,
1918
+ "grad_norm": 0.5946955680847168,
1919
+ "learning_rate": 0.00018981114932017609,
1920
+ "loss": 0.4600903391838074,
1921
+ "num_input_tokens_seen": 615114,
1922
+ "step": 186,
1923
+ "train_runtime": 1145.289,
1924
+ "train_tokens_per_second": 537.082
1925
+ },
1926
+ {
1927
+ "epoch": 1.5080645161290323,
1928
+ "grad_norm": 0.5127443075180054,
1929
+ "learning_rate": 0.00018969899073204686,
1930
+ "loss": 0.3844794034957886,
1931
+ "num_input_tokens_seen": 618532,
1932
+ "step": 187,
1933
+ "train_runtime": 1150.4096,
1934
+ "train_tokens_per_second": 537.662
1935
+ },
1936
+ {
1937
+ "epoch": 1.5161290322580645,
1938
+ "grad_norm": 0.592376172542572,
1939
+ "learning_rate": 0.00018958625170950545,
1940
+ "loss": 0.4546698033809662,
1941
+ "num_input_tokens_seen": 621584,
1942
+ "step": 188,
1943
+ "train_runtime": 1155.2548,
1944
+ "train_tokens_per_second": 538.049
1945
+ },
1946
+ {
1947
+ "epoch": 1.5241935483870968,
1948
+ "grad_norm": 0.5506513714790344,
1949
+ "learning_rate": 0.00018947293298207635,
1950
+ "loss": 0.45888251066207886,
1951
+ "num_input_tokens_seen": 624890,
1952
+ "step": 189,
1953
+ "train_runtime": 1160.301,
1954
+ "train_tokens_per_second": 538.559
1955
+ },
1956
+ {
1957
+ "epoch": 1.532258064516129,
1958
+ "grad_norm": 0.7385762929916382,
1959
+ "learning_rate": 0.00018935903528303523,
1960
+ "loss": 0.47872915863990784,
1961
+ "num_input_tokens_seen": 627898,
1962
+ "step": 190,
1963
+ "train_runtime": 1164.9125,
1964
+ "train_tokens_per_second": 539.009
1965
+ },
1966
+ {
1967
+ "epoch": 1.5403225806451613,
1968
+ "grad_norm": 0.48639681935310364,
1969
+ "learning_rate": 0.0001892445593494042,
1970
+ "loss": 0.3977377414703369,
1971
+ "num_input_tokens_seen": 631260,
1972
+ "step": 191,
1973
+ "train_runtime": 1169.8778,
1974
+ "train_tokens_per_second": 539.595
1975
+ },
1976
+ {
1977
+ "epoch": 1.5483870967741935,
1978
+ "grad_norm": 0.5805647969245911,
1979
+ "learning_rate": 0.0001891295059219472,
1980
+ "loss": 0.4197312593460083,
1981
+ "num_input_tokens_seen": 634598,
1982
+ "step": 192,
1983
+ "train_runtime": 1174.6329,
1984
+ "train_tokens_per_second": 540.252
1985
+ },
1986
+ {
1987
+ "epoch": 1.5564516129032258,
1988
+ "grad_norm": 0.55940842628479,
1989
+ "learning_rate": 0.00018901387574516497,
1990
+ "loss": 0.45023104548454285,
1991
+ "num_input_tokens_seen": 637776,
1992
+ "step": 193,
1993
+ "train_runtime": 1179.3065,
1994
+ "train_tokens_per_second": 540.806
1995
+ },
1996
+ {
1997
+ "epoch": 1.564516129032258,
1998
+ "grad_norm": 0.5386122465133667,
1999
+ "learning_rate": 0.00018889766956729044,
2000
+ "loss": 0.4272036850452423,
2001
+ "num_input_tokens_seen": 641100,
2002
+ "step": 194,
2003
+ "train_runtime": 1183.9205,
2004
+ "train_tokens_per_second": 541.506
2005
+ },
2006
+ {
2007
+ "epoch": 1.5725806451612905,
2008
+ "grad_norm": 0.5532417297363281,
2009
+ "learning_rate": 0.00018878088814028364,
2010
+ "loss": 0.40990498661994934,
2011
+ "num_input_tokens_seen": 644472,
2012
+ "step": 195,
2013
+ "train_runtime": 1188.6266,
2014
+ "train_tokens_per_second": 542.199
2015
+ },
2016
+ {
2017
+ "epoch": 1.5806451612903225,
2018
+ "grad_norm": 0.5226889252662659,
2019
+ "learning_rate": 0.00018866353221982718,
2020
+ "loss": 0.3513818383216858,
2021
+ "num_input_tokens_seen": 647738,
2022
+ "step": 196,
2023
+ "train_runtime": 1193.3282,
2024
+ "train_tokens_per_second": 542.8
2025
+ },
2026
+ {
2027
+ "epoch": 1.588709677419355,
2028
+ "grad_norm": 0.6106642484664917,
2029
+ "learning_rate": 0.000188545602565321,
2030
+ "loss": 0.4655504822731018,
2031
+ "num_input_tokens_seen": 651030,
2032
+ "step": 197,
2033
+ "train_runtime": 1197.831,
2034
+ "train_tokens_per_second": 543.507
2035
+ },
2036
+ {
2037
+ "epoch": 1.596774193548387,
2038
+ "grad_norm": 0.6960707902908325,
2039
+ "learning_rate": 0.00018842709993987776,
2040
+ "loss": 0.46613919734954834,
2041
+ "num_input_tokens_seen": 654362,
2042
+ "step": 198,
2043
+ "train_runtime": 1202.5116,
2044
+ "train_tokens_per_second": 544.163
2045
+ },
2046
+ {
2047
+ "epoch": 1.6048387096774195,
2048
+ "grad_norm": 0.6360778212547302,
2049
+ "learning_rate": 0.00018830802511031762,
2050
+ "loss": 0.40627503395080566,
2051
+ "num_input_tokens_seen": 657668,
2052
+ "step": 199,
2053
+ "train_runtime": 1207.1242,
2054
+ "train_tokens_per_second": 544.822
2055
+ },
2056
+ {
2057
+ "epoch": 1.6129032258064515,
2058
+ "grad_norm": 0.5545072555541992,
2059
+ "learning_rate": 0.0001881883788471636,
2060
+ "loss": 0.4130961298942566,
2061
+ "num_input_tokens_seen": 661006,
2062
+ "step": 200,
2063
+ "train_runtime": 1211.7449,
2064
+ "train_tokens_per_second": 545.499
2065
+ },
2066
+ {
2067
+ "epoch": 1.620967741935484,
2068
+ "grad_norm": 0.5094169974327087,
2069
+ "learning_rate": 0.00018806816192463625,
2070
+ "loss": 0.40930330753326416,
2071
+ "num_input_tokens_seen": 664466,
2072
+ "step": 201,
2073
+ "train_runtime": 1216.6247,
2074
+ "train_tokens_per_second": 546.155
2075
+ },
2076
+ {
2077
+ "epoch": 1.629032258064516,
2078
+ "grad_norm": 0.6566628813743591,
2079
+ "learning_rate": 0.0001879473751206489,
2080
+ "loss": 0.4623265266418457,
2081
+ "num_input_tokens_seen": 667634,
2082
+ "step": 202,
2083
+ "train_runtime": 1221.1278,
2084
+ "train_tokens_per_second": 546.736
2085
+ },
2086
+ {
2087
+ "epoch": 1.6370967741935485,
2088
+ "grad_norm": 0.542569100856781,
2089
+ "learning_rate": 0.00018782601921680256,
2090
+ "loss": 0.44509240984916687,
2091
+ "num_input_tokens_seen": 671080,
2092
+ "step": 203,
2093
+ "train_runtime": 1225.9964,
2094
+ "train_tokens_per_second": 547.375
2095
+ },
2096
+ {
2097
+ "epoch": 1.6451612903225805,
2098
+ "grad_norm": 0.628744900226593,
2099
+ "learning_rate": 0.00018770409499838073,
2100
+ "loss": 0.4878479540348053,
2101
+ "num_input_tokens_seen": 674508,
2102
+ "step": 204,
2103
+ "train_runtime": 1230.9685,
2104
+ "train_tokens_per_second": 547.949
2105
+ },
2106
+ {
2107
+ "epoch": 1.653225806451613,
2108
+ "grad_norm": 0.558655858039856,
2109
+ "learning_rate": 0.0001875816032543445,
2110
+ "loss": 0.41668811440467834,
2111
+ "num_input_tokens_seen": 677700,
2112
+ "step": 205,
2113
+ "train_runtime": 1235.5957,
2114
+ "train_tokens_per_second": 548.48
2115
+ },
2116
+ {
2117
+ "epoch": 1.661290322580645,
2118
+ "grad_norm": 0.5680912137031555,
2119
+ "learning_rate": 0.00018745854477732733,
2120
+ "loss": 0.42818352580070496,
2121
+ "num_input_tokens_seen": 680978,
2122
+ "step": 206,
2123
+ "train_runtime": 1240.4198,
2124
+ "train_tokens_per_second": 548.99
2125
+ },
2126
+ {
2127
+ "epoch": 1.6693548387096775,
2128
+ "grad_norm": 0.5377702713012695,
2129
+ "learning_rate": 0.00018733492036363005,
2130
+ "loss": 0.4158581495285034,
2131
+ "num_input_tokens_seen": 684328,
2132
+ "step": 207,
2133
+ "train_runtime": 1245.2876,
2134
+ "train_tokens_per_second": 549.534
2135
+ },
2136
+ {
2137
+ "epoch": 1.6774193548387095,
2138
+ "grad_norm": 0.5735633969306946,
2139
+ "learning_rate": 0.0001872107308132155,
2140
+ "loss": 0.4311006963253021,
2141
+ "num_input_tokens_seen": 687676,
2142
+ "step": 208,
2143
+ "train_runtime": 1250.0579,
2144
+ "train_tokens_per_second": 550.115
2145
+ },
2146
+ {
2147
+ "epoch": 1.685483870967742,
2148
+ "grad_norm": 0.5267180800437927,
2149
+ "learning_rate": 0.00018708597692970353,
2150
+ "loss": 0.38378968834877014,
2151
+ "num_input_tokens_seen": 691000,
2152
+ "step": 209,
2153
+ "train_runtime": 1254.9146,
2154
+ "train_tokens_per_second": 550.635
2155
+ },
2156
+ {
2157
+ "epoch": 1.6935483870967742,
2158
+ "grad_norm": 0.693577766418457,
2159
+ "learning_rate": 0.00018696065952036571,
2160
+ "loss": 0.46765580773353577,
2161
+ "num_input_tokens_seen": 694156,
2162
+ "step": 210,
2163
+ "train_runtime": 1259.4417,
2164
+ "train_tokens_per_second": 551.162
2165
+ },
2166
+ {
2167
+ "epoch": 1.6935483870967742,
2168
+ "eval_loss": 2.4968807697296143,
2169
+ "eval_runtime": 12.2486,
2170
+ "eval_samples_per_second": 4.245,
2171
+ "eval_steps_per_second": 2.123,
2172
+ "num_input_tokens_seen": 694156,
2173
+ "step": 210
2174
+ },
2175
+ {
2176
+ "epoch": 1.7016129032258065,
2177
+ "grad_norm": 0.5160212516784668,
2178
+ "learning_rate": 0.00018683477939612021,
2179
+ "loss": 0.383802592754364,
2180
+ "num_input_tokens_seen": 697510,
2181
+ "step": 211,
2182
+ "train_runtime": 1296.5517,
2183
+ "train_tokens_per_second": 537.973
2184
+ },
2185
+ {
2186
+ "epoch": 1.7096774193548387,
2187
+ "grad_norm": 0.5070053935050964,
2188
+ "learning_rate": 0.0001867083373715264,
2189
+ "loss": 0.39444947242736816,
2190
+ "num_input_tokens_seen": 700884,
2191
+ "step": 212,
2192
+ "train_runtime": 1301.2787,
2193
+ "train_tokens_per_second": 538.612
2194
+ },
2195
+ {
2196
+ "epoch": 1.717741935483871,
2197
+ "grad_norm": 0.6223205327987671,
2198
+ "learning_rate": 0.00018658133426477965,
2199
+ "loss": 0.4722726345062256,
2200
+ "num_input_tokens_seen": 704288,
2201
+ "step": 213,
2202
+ "train_runtime": 1306.1565,
2203
+ "train_tokens_per_second": 539.206
2204
+ },
2205
+ {
2206
+ "epoch": 1.7258064516129032,
2207
+ "grad_norm": 0.6386722326278687,
2208
+ "learning_rate": 0.00018645377089770616,
2209
+ "loss": 0.40639519691467285,
2210
+ "num_input_tokens_seen": 707404,
2211
+ "step": 214,
2212
+ "train_runtime": 1310.6997,
2213
+ "train_tokens_per_second": 539.715
2214
+ },
2215
+ {
2216
+ "epoch": 1.7338709677419355,
2217
+ "grad_norm": 0.5457852482795715,
2218
+ "learning_rate": 0.00018632564809575742,
2219
+ "loss": 0.40314507484436035,
2220
+ "num_input_tokens_seen": 710738,
2221
+ "step": 215,
2222
+ "train_runtime": 1315.6988,
2223
+ "train_tokens_per_second": 540.198
2224
+ },
2225
+ {
2226
+ "epoch": 1.7419354838709677,
2227
+ "grad_norm": 0.6622642278671265,
2228
+ "learning_rate": 0.00018619696668800492,
2229
+ "loss": 0.4788181185722351,
2230
+ "num_input_tokens_seen": 714052,
2231
+ "step": 216,
2232
+ "train_runtime": 1320.6297,
2233
+ "train_tokens_per_second": 540.691
2234
+ },
2235
+ {
2236
+ "epoch": 1.75,
2237
+ "grad_norm": 0.5140097737312317,
2238
+ "learning_rate": 0.00018606772750713504,
2239
+ "loss": 0.4071129262447357,
2240
+ "num_input_tokens_seen": 717406,
2241
+ "step": 217,
2242
+ "train_runtime": 1325.7428,
2243
+ "train_tokens_per_second": 541.135
2244
+ },
2245
+ {
2246
+ "epoch": 1.7580645161290323,
2247
+ "grad_norm": 0.5269467830657959,
2248
+ "learning_rate": 0.00018593793138944328,
2249
+ "loss": 0.4001181721687317,
2250
+ "num_input_tokens_seen": 720724,
2251
+ "step": 218,
2252
+ "train_runtime": 1330.8634,
2253
+ "train_tokens_per_second": 541.546
2254
+ },
2255
+ {
2256
+ "epoch": 1.7661290322580645,
2257
+ "grad_norm": 0.6335601806640625,
2258
+ "learning_rate": 0.0001858075791748291,
2259
+ "loss": 0.5110459923744202,
2260
+ "num_input_tokens_seen": 724156,
2261
+ "step": 219,
2262
+ "train_runtime": 1335.9177,
2263
+ "train_tokens_per_second": 542.066
2264
+ },
2265
+ {
2266
+ "epoch": 1.7741935483870968,
2267
+ "grad_norm": 0.5240700840950012,
2268
+ "learning_rate": 0.0001856766717067904,
2269
+ "loss": 0.42234110832214355,
2270
+ "num_input_tokens_seen": 727386,
2271
+ "step": 220,
2272
+ "train_runtime": 1340.9375,
2273
+ "train_tokens_per_second": 542.446
2274
+ },
2275
+ {
2276
+ "epoch": 1.782258064516129,
2277
+ "grad_norm": 0.4940207898616791,
2278
+ "learning_rate": 0.00018554520983241814,
2279
+ "loss": 0.40678343176841736,
2280
+ "num_input_tokens_seen": 730576,
2281
+ "step": 221,
2282
+ "train_runtime": 1345.6557,
2283
+ "train_tokens_per_second": 542.914
2284
+ },
2285
+ {
2286
+ "epoch": 1.7903225806451613,
2287
+ "grad_norm": 0.4842962324619293,
2288
+ "learning_rate": 0.00018541319440239066,
2289
+ "loss": 0.37585100531578064,
2290
+ "num_input_tokens_seen": 734042,
2291
+ "step": 222,
2292
+ "train_runtime": 1350.5324,
2293
+ "train_tokens_per_second": 543.52
2294
+ },
2295
+ {
2296
+ "epoch": 1.7983870967741935,
2297
+ "grad_norm": 0.5687828660011292,
2298
+ "learning_rate": 0.00018528062627096845,
2299
+ "loss": 0.4230225086212158,
2300
+ "num_input_tokens_seen": 737388,
2301
+ "step": 223,
2302
+ "train_runtime": 1355.36,
2303
+ "train_tokens_per_second": 544.053
2304
+ },
2305
+ {
2306
+ "epoch": 1.8064516129032258,
2307
+ "grad_norm": 0.5137950778007507,
2308
+ "learning_rate": 0.0001851475062959884,
2309
+ "loss": 0.4101966619491577,
2310
+ "num_input_tokens_seen": 740714,
2311
+ "step": 224,
2312
+ "train_runtime": 1359.9896,
2313
+ "train_tokens_per_second": 544.647
2314
+ },
2315
+ {
2316
+ "epoch": 1.814516129032258,
2317
+ "grad_norm": 0.5618765950202942,
2318
+ "learning_rate": 0.00018501383533885837,
2319
+ "loss": 0.3829328417778015,
2320
+ "num_input_tokens_seen": 743952,
2321
+ "step": 225,
2322
+ "train_runtime": 1364.615,
2323
+ "train_tokens_per_second": 545.174
2324
+ },
2325
+ {
2326
+ "epoch": 1.8225806451612905,
2327
+ "grad_norm": 0.6131100058555603,
2328
+ "learning_rate": 0.00018487961426455157,
2329
+ "loss": 0.4687027335166931,
2330
+ "num_input_tokens_seen": 747324,
2331
+ "step": 226,
2332
+ "train_runtime": 1369.3307,
2333
+ "train_tokens_per_second": 545.759
2334
+ },
2335
+ {
2336
+ "epoch": 1.8306451612903225,
2337
+ "grad_norm": 0.6085920929908752,
2338
+ "learning_rate": 0.0001847448439416009,
2339
+ "loss": 0.4704275131225586,
2340
+ "num_input_tokens_seen": 750660,
2341
+ "step": 227,
2342
+ "train_runtime": 1373.9767,
2343
+ "train_tokens_per_second": 546.341
2344
+ },
2345
+ {
2346
+ "epoch": 1.838709677419355,
2347
+ "grad_norm": 0.682384192943573,
2348
+ "learning_rate": 0.00018460952524209355,
2349
+ "loss": 0.4709826111793518,
2350
+ "num_input_tokens_seen": 753890,
2351
+ "step": 228,
2352
+ "train_runtime": 1378.611,
2353
+ "train_tokens_per_second": 546.848
2354
+ },
2355
+ {
2356
+ "epoch": 1.846774193548387,
2357
+ "grad_norm": 0.6233918070793152,
2358
+ "learning_rate": 0.0001844736590416651,
2359
+ "loss": 0.41492459177970886,
2360
+ "num_input_tokens_seen": 757196,
2361
+ "step": 229,
2362
+ "train_runtime": 1383.1991,
2363
+ "train_tokens_per_second": 547.424
2364
+ },
2365
+ {
2366
+ "epoch": 1.8548387096774195,
2367
+ "grad_norm": 0.6080527305603027,
2368
+ "learning_rate": 0.00018433724621949392,
2369
+ "loss": 0.41505876183509827,
2370
+ "num_input_tokens_seen": 760606,
2371
+ "step": 230,
2372
+ "train_runtime": 1387.9745,
2373
+ "train_tokens_per_second": 547.997
2374
+ },
2375
+ {
2376
+ "epoch": 1.8629032258064515,
2377
+ "grad_norm": 0.5890636444091797,
2378
+ "learning_rate": 0.00018420028765829568,
2379
+ "loss": 0.4203844964504242,
2380
+ "num_input_tokens_seen": 763910,
2381
+ "step": 231,
2382
+ "train_runtime": 1392.7824,
2383
+ "train_tokens_per_second": 548.478
2384
+ },
2385
+ {
2386
+ "epoch": 1.870967741935484,
2387
+ "grad_norm": 0.5636559724807739,
2388
+ "learning_rate": 0.00018406278424431736,
2389
+ "loss": 0.44402995705604553,
2390
+ "num_input_tokens_seen": 767166,
2391
+ "step": 232,
2392
+ "train_runtime": 1397.3628,
2393
+ "train_tokens_per_second": 549.01
2394
+ },
2395
+ {
2396
+ "epoch": 1.879032258064516,
2397
+ "grad_norm": 0.5407444834709167,
2398
+ "learning_rate": 0.00018392473686733163,
2399
+ "loss": 0.43737179040908813,
2400
+ "num_input_tokens_seen": 770592,
2401
+ "step": 233,
2402
+ "train_runtime": 1402.2872,
2403
+ "train_tokens_per_second": 549.525
2404
+ },
2405
+ {
2406
+ "epoch": 1.8870967741935485,
2407
+ "grad_norm": 0.6623079776763916,
2408
+ "learning_rate": 0.00018378614642063115,
2409
+ "loss": 0.4496549367904663,
2410
+ "num_input_tokens_seen": 773594,
2411
+ "step": 234,
2412
+ "train_runtime": 1406.7685,
2413
+ "train_tokens_per_second": 549.909
2414
+ },
2415
+ {
2416
+ "epoch": 1.8951612903225805,
2417
+ "grad_norm": 0.676179826259613,
2418
+ "learning_rate": 0.00018364701380102266,
2419
+ "loss": 0.4767211973667145,
2420
+ "num_input_tokens_seen": 776592,
2421
+ "step": 235,
2422
+ "train_runtime": 1411.1241,
2423
+ "train_tokens_per_second": 550.336
2424
+ },
2425
+ {
2426
+ "epoch": 1.903225806451613,
2427
+ "grad_norm": 0.5864387154579163,
2428
+ "learning_rate": 0.0001835073399088214,
2429
+ "loss": 0.4903409779071808,
2430
+ "num_input_tokens_seen": 779942,
2431
+ "step": 236,
2432
+ "train_runtime": 1416.1385,
2433
+ "train_tokens_per_second": 550.753
2434
+ },
2435
+ {
2436
+ "epoch": 1.911290322580645,
2437
+ "grad_norm": 0.4824385643005371,
2438
+ "learning_rate": 0.00018336712564784503,
2439
+ "loss": 0.4071444869041443,
2440
+ "num_input_tokens_seen": 783252,
2441
+ "step": 237,
2442
+ "train_runtime": 1420.8937,
2443
+ "train_tokens_per_second": 551.239
2444
+ },
2445
+ {
2446
+ "epoch": 1.9193548387096775,
2447
+ "grad_norm": 0.5865503549575806,
2448
+ "learning_rate": 0.00018322637192540785,
2449
+ "loss": 0.4477483034133911,
2450
+ "num_input_tokens_seen": 786556,
2451
+ "step": 238,
2452
+ "train_runtime": 1425.6472,
2453
+ "train_tokens_per_second": 551.719
2454
+ },
2455
+ {
2456
+ "epoch": 1.9274193548387095,
2457
+ "grad_norm": 0.6280811429023743,
2458
+ "learning_rate": 0.00018308507965231508,
2459
+ "loss": 0.4521775245666504,
2460
+ "num_input_tokens_seen": 789882,
2461
+ "step": 239,
2462
+ "train_runtime": 1430.4668,
2463
+ "train_tokens_per_second": 552.185
2464
+ },
2465
+ {
2466
+ "epoch": 1.935483870967742,
2467
+ "grad_norm": 0.5393728613853455,
2468
+ "learning_rate": 0.00018294324974285677,
2469
+ "loss": 0.3891618549823761,
2470
+ "num_input_tokens_seen": 793254,
2471
+ "step": 240,
2472
+ "train_runtime": 1435.2181,
2473
+ "train_tokens_per_second": 552.706
2474
+ },
2475
+ {
2476
+ "epoch": 1.935483870967742,
2477
+ "eval_loss": 2.3643229007720947,
2478
+ "eval_runtime": 12.1283,
2479
+ "eval_samples_per_second": 4.287,
2480
+ "eval_steps_per_second": 2.144,
2481
+ "num_input_tokens_seen": 793254,
2482
+ "step": 240
2483
+ }
2484
+ ],
2485
+ "logging_steps": 1,
2486
+ "max_steps": 1240,
2487
+ "num_input_tokens_seen": 793254,
2488
+ "num_train_epochs": 10,
2489
+ "save_steps": 30,
2490
+ "stateful_callbacks": {
2491
+ "TrainerControl": {
2492
+ "args": {
2493
+ "should_epoch_stop": false,
2494
+ "should_evaluate": false,
2495
+ "should_log": false,
2496
+ "should_save": true,
2497
+ "should_training_stop": false
2498
+ },
2499
+ "attributes": {}
2500
+ }
2501
+ },
2502
+ "total_flos": 1.1879165274109056e+16,
2503
+ "train_batch_size": 2,
2504
+ "trial_name": null,
2505
+ "trial_params": null
2506
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbb176183e719cb4390ae273728327f28a311740b24c3049448e069def331d2d
3
+ size 5713