wmaousley committed on
Commit
dec813c
·
verified ·
1 Parent(s): 0f0df11

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 32,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 16,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "q_proj",
25
+ "o_proj",
26
+ "v_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79d856104b781021c43fad86f0478030885797265c8d4fffb66447b5b720f4a7
3
+ size 8676008
checkpoint-40000/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2-0.5B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** Qwen/Qwen2-0.5B-Instruct
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
checkpoint-40000/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 32,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 16,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "q_proj",
25
+ "o_proj",
26
+ "v_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-40000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acd9947a02ce4df1a5163b8a15fb20d09a8607496e30e80d355eff380d1c5318
3
+ size 8676008
checkpoint-40000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9282d1426027801df62d80ab2f99ad7ad22f1962924811bd5beb4ccc57f70ffa
3
+ size 17463051
checkpoint-40000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:564569ea28938ad2cee9368c34eacaf7d2105aa7bf36e9bf0b4711f73c4711d7
3
+ size 14645
checkpoint-40000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7e2760ee3e1c81b83ba7a0a9ee1ed6fc636d614401727fe15fe3af580dbab5c
3
+ size 1465
checkpoint-40000/trainer_state.json ADDED
@@ -0,0 +1,3153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.917066233979326,
5
+ "eval_steps": 1000,
6
+ "global_step": 40000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.007292483272866493,
13
+ "grad_norm": 2.1235318183898926,
14
+ "learning_rate": 4e-05,
15
+ "loss": 2.7429,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.014584966545732986,
20
+ "grad_norm": 1.9533482789993286,
21
+ "learning_rate": 8e-05,
22
+ "loss": 1.4786,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.02187744981859948,
27
+ "grad_norm": 1.5908012390136719,
28
+ "learning_rate": 0.00012,
29
+ "loss": 1.252,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.029169933091465972,
34
+ "grad_norm": 1.592781662940979,
35
+ "learning_rate": 0.00016,
36
+ "loss": 1.1674,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.036462416364332464,
41
+ "grad_norm": 1.4071415662765503,
42
+ "learning_rate": 0.0002,
43
+ "loss": 1.101,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.04375489963719896,
48
+ "grad_norm": 1.4228886365890503,
49
+ "learning_rate": 0.0001995078255733832,
50
+ "loss": 1.0487,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.05104738291006545,
55
+ "grad_norm": 1.2705847024917603,
56
+ "learning_rate": 0.00019901565114676642,
57
+ "loss": 1.0119,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.058339866182931945,
62
+ "grad_norm": 1.1770137548446655,
63
+ "learning_rate": 0.00019852347672014964,
64
+ "loss": 0.9906,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.06563234945579843,
69
+ "grad_norm": 1.1681164503097534,
70
+ "learning_rate": 0.00019803130229353283,
71
+ "loss": 0.9645,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.07292483272866493,
76
+ "grad_norm": 1.020504117012024,
77
+ "learning_rate": 0.00019753912786691605,
78
+ "loss": 0.9525,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.07292483272866493,
83
+ "eval_loss": 0.9407642483711243,
84
+ "eval_runtime": 61.0906,
85
+ "eval_samples_per_second": 146.586,
86
+ "eval_steps_per_second": 18.333,
87
+ "step": 1000
88
+ },
89
+ {
90
+ "epoch": 0.08021731600153142,
91
+ "grad_norm": 1.079444408416748,
92
+ "learning_rate": 0.00019704695344029924,
93
+ "loss": 0.9414,
94
+ "step": 1100
95
+ },
96
+ {
97
+ "epoch": 0.08750979927439792,
98
+ "grad_norm": 1.057377576828003,
99
+ "learning_rate": 0.00019655477901368246,
100
+ "loss": 0.9231,
101
+ "step": 1200
102
+ },
103
+ {
104
+ "epoch": 0.0948022825472644,
105
+ "grad_norm": 1.068018913269043,
106
+ "learning_rate": 0.00019606260458706568,
107
+ "loss": 0.9168,
108
+ "step": 1300
109
+ },
110
+ {
111
+ "epoch": 0.1020947658201309,
112
+ "grad_norm": 0.9460920095443726,
113
+ "learning_rate": 0.00019557043016044887,
114
+ "loss": 0.9031,
115
+ "step": 1400
116
+ },
117
+ {
118
+ "epoch": 0.1093872490929974,
119
+ "grad_norm": 1.056226134300232,
120
+ "learning_rate": 0.00019507825573383206,
121
+ "loss": 0.8901,
122
+ "step": 1500
123
+ },
124
+ {
125
+ "epoch": 0.11667973236586389,
126
+ "grad_norm": 1.0429835319519043,
127
+ "learning_rate": 0.00019458608130721528,
128
+ "loss": 0.8928,
129
+ "step": 1600
130
+ },
131
+ {
132
+ "epoch": 0.12397221563873038,
133
+ "grad_norm": 1.050790548324585,
134
+ "learning_rate": 0.0001940939068805985,
135
+ "loss": 0.8803,
136
+ "step": 1700
137
+ },
138
+ {
139
+ "epoch": 0.13126469891159687,
140
+ "grad_norm": 0.9586555361747742,
141
+ "learning_rate": 0.0001936017324539817,
142
+ "loss": 0.8809,
143
+ "step": 1800
144
+ },
145
+ {
146
+ "epoch": 0.13855718218446336,
147
+ "grad_norm": 0.985379159450531,
148
+ "learning_rate": 0.00019310955802736491,
149
+ "loss": 0.8743,
150
+ "step": 1900
151
+ },
152
+ {
153
+ "epoch": 0.14584966545732986,
154
+ "grad_norm": 0.9307010769844055,
155
+ "learning_rate": 0.00019261738360074813,
156
+ "loss": 0.8727,
157
+ "step": 2000
158
+ },
159
+ {
160
+ "epoch": 0.14584966545732986,
161
+ "eval_loss": 0.86456698179245,
162
+ "eval_runtime": 60.6283,
163
+ "eval_samples_per_second": 147.703,
164
+ "eval_steps_per_second": 18.473,
165
+ "step": 2000
166
+ },
167
+ {
168
+ "epoch": 0.15314214873019635,
169
+ "grad_norm": 1.0384063720703125,
170
+ "learning_rate": 0.00019212520917413133,
171
+ "loss": 0.8742,
172
+ "step": 2100
173
+ },
174
+ {
175
+ "epoch": 0.16043463200306285,
176
+ "grad_norm": 0.9662402868270874,
177
+ "learning_rate": 0.00019163303474751452,
178
+ "loss": 0.8661,
179
+ "step": 2200
180
+ },
181
+ {
182
+ "epoch": 0.16772711527592934,
183
+ "grad_norm": 0.9773098230361938,
184
+ "learning_rate": 0.00019114086032089774,
185
+ "loss": 0.8576,
186
+ "step": 2300
187
+ },
188
+ {
189
+ "epoch": 0.17501959854879584,
190
+ "grad_norm": 0.9672012329101562,
191
+ "learning_rate": 0.00019064868589428093,
192
+ "loss": 0.8595,
193
+ "step": 2400
194
+ },
195
+ {
196
+ "epoch": 0.1823120818216623,
197
+ "grad_norm": 0.9758124351501465,
198
+ "learning_rate": 0.00019015651146766415,
199
+ "loss": 0.8524,
200
+ "step": 2500
201
+ },
202
+ {
203
+ "epoch": 0.1896045650945288,
204
+ "grad_norm": 0.972232460975647,
205
+ "learning_rate": 0.00018966433704104737,
206
+ "loss": 0.8468,
207
+ "step": 2600
208
+ },
209
+ {
210
+ "epoch": 0.1968970483673953,
211
+ "grad_norm": 0.9417553544044495,
212
+ "learning_rate": 0.00018917216261443056,
213
+ "loss": 0.8412,
214
+ "step": 2700
215
+ },
216
+ {
217
+ "epoch": 0.2041895316402618,
218
+ "grad_norm": 0.9395071864128113,
219
+ "learning_rate": 0.00018867998818781375,
220
+ "loss": 0.8413,
221
+ "step": 2800
222
+ },
223
+ {
224
+ "epoch": 0.2114820149131283,
225
+ "grad_norm": 0.9951208233833313,
226
+ "learning_rate": 0.000188187813761197,
227
+ "loss": 0.8345,
228
+ "step": 2900
229
+ },
230
+ {
231
+ "epoch": 0.2187744981859948,
232
+ "grad_norm": 0.9656242728233337,
233
+ "learning_rate": 0.0001876956393345802,
234
+ "loss": 0.8317,
235
+ "step": 3000
236
+ },
237
+ {
238
+ "epoch": 0.2187744981859948,
239
+ "eval_loss": 0.8318613767623901,
240
+ "eval_runtime": 61.1356,
241
+ "eval_samples_per_second": 146.478,
242
+ "eval_steps_per_second": 18.32,
243
+ "step": 3000
244
+ },
245
+ {
246
+ "epoch": 0.22606698145886128,
247
+ "grad_norm": 0.8810185194015503,
248
+ "learning_rate": 0.00018720346490796338,
249
+ "loss": 0.8321,
250
+ "step": 3100
251
+ },
252
+ {
253
+ "epoch": 0.23335946473172778,
254
+ "grad_norm": 0.9199262857437134,
255
+ "learning_rate": 0.0001867112904813466,
256
+ "loss": 0.8406,
257
+ "step": 3200
258
+ },
259
+ {
260
+ "epoch": 0.24065194800459427,
261
+ "grad_norm": 0.9557051658630371,
262
+ "learning_rate": 0.00018621911605472982,
263
+ "loss": 0.8277,
264
+ "step": 3300
265
+ },
266
+ {
267
+ "epoch": 0.24794443127746077,
268
+ "grad_norm": 0.9777804017066956,
269
+ "learning_rate": 0.000185726941628113,
270
+ "loss": 0.8272,
271
+ "step": 3400
272
+ },
273
+ {
274
+ "epoch": 0.25523691455032727,
275
+ "grad_norm": 0.8856322169303894,
276
+ "learning_rate": 0.00018523476720149623,
277
+ "loss": 0.8256,
278
+ "step": 3500
279
+ },
280
+ {
281
+ "epoch": 0.26252939782319373,
282
+ "grad_norm": 0.9196017980575562,
283
+ "learning_rate": 0.00018474259277487942,
284
+ "loss": 0.8234,
285
+ "step": 3600
286
+ },
287
+ {
288
+ "epoch": 0.26982188109606026,
289
+ "grad_norm": 0.9568464159965515,
290
+ "learning_rate": 0.00018425041834826264,
291
+ "loss": 0.8193,
292
+ "step": 3700
293
+ },
294
+ {
295
+ "epoch": 0.2771143643689267,
296
+ "grad_norm": 0.9552770256996155,
297
+ "learning_rate": 0.00018375824392164583,
298
+ "loss": 0.8179,
299
+ "step": 3800
300
+ },
301
+ {
302
+ "epoch": 0.28440684764179325,
303
+ "grad_norm": 0.8997077345848083,
304
+ "learning_rate": 0.00018326606949502905,
305
+ "loss": 0.8138,
306
+ "step": 3900
307
+ },
308
+ {
309
+ "epoch": 0.2916993309146597,
310
+ "grad_norm": 0.8896480202674866,
311
+ "learning_rate": 0.00018277389506841224,
312
+ "loss": 0.8172,
313
+ "step": 4000
314
+ },
315
+ {
316
+ "epoch": 0.2916993309146597,
317
+ "eval_loss": 0.8123040199279785,
318
+ "eval_runtime": 60.7914,
319
+ "eval_samples_per_second": 147.307,
320
+ "eval_steps_per_second": 18.424,
321
+ "step": 4000
322
+ },
323
+ {
324
+ "epoch": 0.2989918141875262,
325
+ "grad_norm": 0.9520764350891113,
326
+ "learning_rate": 0.00018228172064179546,
327
+ "loss": 0.8183,
328
+ "step": 4100
329
+ },
330
+ {
331
+ "epoch": 0.3062842974603927,
332
+ "grad_norm": 0.9373065233230591,
333
+ "learning_rate": 0.00018178954621517868,
334
+ "loss": 0.8132,
335
+ "step": 4200
336
+ },
337
+ {
338
+ "epoch": 0.3135767807332592,
339
+ "grad_norm": 0.8733066916465759,
340
+ "learning_rate": 0.00018129737178856187,
341
+ "loss": 0.811,
342
+ "step": 4300
343
+ },
344
+ {
345
+ "epoch": 0.3208692640061257,
346
+ "grad_norm": 0.8866516351699829,
347
+ "learning_rate": 0.00018080519736194507,
348
+ "loss": 0.8093,
349
+ "step": 4400
350
+ },
351
+ {
352
+ "epoch": 0.32816174727899217,
353
+ "grad_norm": 0.9394953846931458,
354
+ "learning_rate": 0.00018031302293532828,
355
+ "loss": 0.8035,
356
+ "step": 4500
357
+ },
358
+ {
359
+ "epoch": 0.3354542305518587,
360
+ "grad_norm": 0.9133720993995667,
361
+ "learning_rate": 0.0001798208485087115,
362
+ "loss": 0.8054,
363
+ "step": 4600
364
+ },
365
+ {
366
+ "epoch": 0.34274671382472516,
367
+ "grad_norm": 0.9428606629371643,
368
+ "learning_rate": 0.0001793286740820947,
369
+ "loss": 0.8076,
370
+ "step": 4700
371
+ },
372
+ {
373
+ "epoch": 0.3500391970975917,
374
+ "grad_norm": 0.8996593356132507,
375
+ "learning_rate": 0.00017883649965547792,
376
+ "loss": 0.812,
377
+ "step": 4800
378
+ },
379
+ {
380
+ "epoch": 0.35733168037045815,
381
+ "grad_norm": 0.9113749265670776,
382
+ "learning_rate": 0.0001783443252288611,
383
+ "loss": 0.8048,
384
+ "step": 4900
385
+ },
386
+ {
387
+ "epoch": 0.3646241636433246,
388
+ "grad_norm": 0.9185646176338196,
389
+ "learning_rate": 0.00017785215080224433,
390
+ "loss": 0.8023,
391
+ "step": 5000
392
+ },
393
+ {
394
+ "epoch": 0.3646241636433246,
395
+ "eval_loss": 0.7973803877830505,
396
+ "eval_runtime": 60.8068,
397
+ "eval_samples_per_second": 147.27,
398
+ "eval_steps_per_second": 18.419,
399
+ "step": 5000
400
+ },
401
+ {
402
+ "epoch": 0.37191664691619114,
403
+ "grad_norm": 0.8994658589363098,
404
+ "learning_rate": 0.00017735997637562755,
405
+ "loss": 0.8089,
406
+ "step": 5100
407
+ },
408
+ {
409
+ "epoch": 0.3792091301890576,
410
+ "grad_norm": 0.8724523782730103,
411
+ "learning_rate": 0.00017686780194901074,
412
+ "loss": 0.8015,
413
+ "step": 5200
414
+ },
415
+ {
416
+ "epoch": 0.38650161346192413,
417
+ "grad_norm": 0.8285540342330933,
418
+ "learning_rate": 0.00017637562752239393,
419
+ "loss": 0.7944,
420
+ "step": 5300
421
+ },
422
+ {
423
+ "epoch": 0.3937940967347906,
424
+ "grad_norm": 0.8982509970664978,
425
+ "learning_rate": 0.00017588345309577718,
426
+ "loss": 0.7952,
427
+ "step": 5400
428
+ },
429
+ {
430
+ "epoch": 0.4010865800076571,
431
+ "grad_norm": 0.9266172051429749,
432
+ "learning_rate": 0.00017539127866916037,
433
+ "loss": 0.7978,
434
+ "step": 5500
435
+ },
436
+ {
437
+ "epoch": 0.4083790632805236,
438
+ "grad_norm": 0.901662290096283,
439
+ "learning_rate": 0.00017489910424254356,
440
+ "loss": 0.7966,
441
+ "step": 5600
442
+ },
443
+ {
444
+ "epoch": 0.4156715465533901,
445
+ "grad_norm": 0.9309051036834717,
446
+ "learning_rate": 0.00017440692981592678,
447
+ "loss": 0.7975,
448
+ "step": 5700
449
+ },
450
+ {
451
+ "epoch": 0.4229640298262566,
452
+ "grad_norm": 0.8789328336715698,
453
+ "learning_rate": 0.00017391475538930997,
454
+ "loss": 0.7997,
455
+ "step": 5800
456
+ },
457
+ {
458
+ "epoch": 0.4302565130991231,
459
+ "grad_norm": 0.8636139035224915,
460
+ "learning_rate": 0.0001734225809626932,
461
+ "loss": 0.7914,
462
+ "step": 5900
463
+ },
464
+ {
465
+ "epoch": 0.4375489963719896,
466
+ "grad_norm": 0.9468287229537964,
467
+ "learning_rate": 0.00017293040653607638,
468
+ "loss": 0.7859,
469
+ "step": 6000
470
+ },
471
+ {
472
+ "epoch": 0.4375489963719896,
473
+ "eval_loss": 0.7869976162910461,
474
+ "eval_runtime": 60.7741,
475
+ "eval_samples_per_second": 147.349,
476
+ "eval_steps_per_second": 18.429,
477
+ "step": 6000
478
+ },
479
+ {
480
+ "epoch": 0.44484147964485604,
481
+ "grad_norm": 0.867158055305481,
482
+ "learning_rate": 0.0001724382321094596,
483
+ "loss": 0.7924,
484
+ "step": 6100
485
+ },
486
+ {
487
+ "epoch": 0.45213396291772256,
488
+ "grad_norm": 0.9379836320877075,
489
+ "learning_rate": 0.0001719460576828428,
490
+ "loss": 0.7902,
491
+ "step": 6200
492
+ },
493
+ {
494
+ "epoch": 0.45942644619058903,
495
+ "grad_norm": 0.8591951727867126,
496
+ "learning_rate": 0.000171453883256226,
497
+ "loss": 0.7926,
498
+ "step": 6300
499
+ },
500
+ {
501
+ "epoch": 0.46671892946345556,
502
+ "grad_norm": 0.9702317118644714,
503
+ "learning_rate": 0.00017096170882960923,
504
+ "loss": 0.7867,
505
+ "step": 6400
506
+ },
507
+ {
508
+ "epoch": 0.474011412736322,
509
+ "grad_norm": 0.902302086353302,
510
+ "learning_rate": 0.00017046953440299242,
511
+ "loss": 0.7897,
512
+ "step": 6500
513
+ },
514
+ {
515
+ "epoch": 0.48130389600918855,
516
+ "grad_norm": 0.889926552772522,
517
+ "learning_rate": 0.00016997735997637561,
518
+ "loss": 0.7857,
519
+ "step": 6600
520
+ },
521
+ {
522
+ "epoch": 0.488596379282055,
523
+ "grad_norm": 0.8906420469284058,
524
+ "learning_rate": 0.00016948518554975886,
525
+ "loss": 0.7878,
526
+ "step": 6700
527
+ },
528
+ {
529
+ "epoch": 0.49588886255492154,
530
+ "grad_norm": 0.919983983039856,
531
+ "learning_rate": 0.00016899301112314205,
532
+ "loss": 0.7876,
533
+ "step": 6800
534
+ },
535
+ {
536
+ "epoch": 0.5031813458277881,
537
+ "grad_norm": 0.8610624670982361,
538
+ "learning_rate": 0.00016850083669652524,
539
+ "loss": 0.7923,
540
+ "step": 6900
541
+ },
542
+ {
543
+ "epoch": 0.5104738291006545,
544
+ "grad_norm": 0.9339637160301208,
545
+ "learning_rate": 0.00016800866226990846,
546
+ "loss": 0.7837,
547
+ "step": 7000
548
+ },
549
+ {
550
+ "epoch": 0.5104738291006545,
551
+ "eval_loss": 0.7791191935539246,
552
+ "eval_runtime": 60.8878,
553
+ "eval_samples_per_second": 147.074,
554
+ "eval_steps_per_second": 18.395,
555
+ "step": 7000
556
+ },
557
+ {
558
+ "epoch": 0.517766312373521,
559
+ "grad_norm": 0.9073446393013,
560
+ "learning_rate": 0.00016751648784329168,
561
+ "loss": 0.7809,
562
+ "step": 7100
563
+ },
564
+ {
565
+ "epoch": 0.5250587956463875,
566
+ "grad_norm": 0.9348235726356506,
567
+ "learning_rate": 0.00016702431341667487,
568
+ "loss": 0.7793,
569
+ "step": 7200
570
+ },
571
+ {
572
+ "epoch": 0.5323512789192539,
573
+ "grad_norm": 0.9155163168907166,
574
+ "learning_rate": 0.0001665321389900581,
575
+ "loss": 0.7821,
576
+ "step": 7300
577
+ },
578
+ {
579
+ "epoch": 0.5396437621921205,
580
+ "grad_norm": 0.9328250885009766,
581
+ "learning_rate": 0.00016603996456344129,
582
+ "loss": 0.7806,
583
+ "step": 7400
584
+ },
585
+ {
586
+ "epoch": 0.546936245464987,
587
+ "grad_norm": 0.8911275863647461,
588
+ "learning_rate": 0.00016554779013682448,
589
+ "loss": 0.7782,
590
+ "step": 7500
591
+ },
592
+ {
593
+ "epoch": 0.5542287287378534,
594
+ "grad_norm": 0.8989250659942627,
595
+ "learning_rate": 0.00016505561571020772,
596
+ "loss": 0.779,
597
+ "step": 7600
598
+ },
599
+ {
600
+ "epoch": 0.5615212120107199,
601
+ "grad_norm": 0.8869723081588745,
602
+ "learning_rate": 0.00016456344128359092,
603
+ "loss": 0.7822,
604
+ "step": 7700
605
+ },
606
+ {
607
+ "epoch": 0.5688136952835865,
608
+ "grad_norm": 0.8631371259689331,
609
+ "learning_rate": 0.0001640712668569741,
610
+ "loss": 0.7768,
611
+ "step": 7800
612
+ },
613
+ {
614
+ "epoch": 0.576106178556453,
615
+ "grad_norm": 0.8868420720100403,
616
+ "learning_rate": 0.00016357909243035733,
617
+ "loss": 0.7834,
618
+ "step": 7900
619
+ },
620
+ {
621
+ "epoch": 0.5833986618293194,
622
+ "grad_norm": 0.9253202080726624,
623
+ "learning_rate": 0.00016308691800374055,
624
+ "loss": 0.773,
625
+ "step": 8000
626
+ },
627
+ {
628
+ "epoch": 0.5833986618293194,
629
+ "eval_loss": 0.7733862400054932,
630
+ "eval_runtime": 60.8911,
631
+ "eval_samples_per_second": 147.066,
632
+ "eval_steps_per_second": 18.394,
633
+ "step": 8000
634
+ },
635
+ {
636
+ "epoch": 0.5906911451021859,
637
+ "grad_norm": 0.830760657787323,
638
+ "learning_rate": 0.00016259474357712374,
639
+ "loss": 0.7756,
640
+ "step": 8100
641
+ },
642
+ {
643
+ "epoch": 0.5979836283750524,
644
+ "grad_norm": 0.9371838569641113,
645
+ "learning_rate": 0.00016210256915050696,
646
+ "loss": 0.776,
647
+ "step": 8200
648
+ },
649
+ {
650
+ "epoch": 0.605276111647919,
651
+ "grad_norm": 0.8486947417259216,
652
+ "learning_rate": 0.00016161039472389015,
653
+ "loss": 0.7758,
654
+ "step": 8300
655
+ },
656
+ {
657
+ "epoch": 0.6125685949207854,
658
+ "grad_norm": 0.8888623118400574,
659
+ "learning_rate": 0.00016111822029727337,
660
+ "loss": 0.783,
661
+ "step": 8400
662
+ },
663
+ {
664
+ "epoch": 0.6198610781936519,
665
+ "grad_norm": 0.9176976084709167,
666
+ "learning_rate": 0.00016062604587065656,
667
+ "loss": 0.7782,
668
+ "step": 8500
669
+ },
670
+ {
671
+ "epoch": 0.6271535614665184,
672
+ "grad_norm": 0.90993732213974,
673
+ "learning_rate": 0.00016013387144403978,
674
+ "loss": 0.7741,
675
+ "step": 8600
676
+ },
677
+ {
678
+ "epoch": 0.6344460447393849,
679
+ "grad_norm": 0.8461544513702393,
680
+ "learning_rate": 0.00015964169701742297,
681
+ "loss": 0.7782,
682
+ "step": 8700
683
+ },
684
+ {
685
+ "epoch": 0.6417385280122514,
686
+ "grad_norm": 0.8642047643661499,
687
+ "learning_rate": 0.0001591495225908062,
688
+ "loss": 0.7706,
689
+ "step": 8800
690
+ },
691
+ {
692
+ "epoch": 0.6490310112851179,
693
+ "grad_norm": 0.8944571018218994,
694
+ "learning_rate": 0.0001586573481641894,
695
+ "loss": 0.7727,
696
+ "step": 8900
697
+ },
698
+ {
699
+ "epoch": 0.6563234945579843,
700
+ "grad_norm": 0.9075286984443665,
701
+ "learning_rate": 0.0001581651737375726,
702
+ "loss": 0.7748,
703
+ "step": 9000
704
+ },
705
+ {
706
+ "epoch": 0.6563234945579843,
707
+ "eval_loss": 0.7666329741477966,
708
+ "eval_runtime": 60.5924,
709
+ "eval_samples_per_second": 147.791,
710
+ "eval_steps_per_second": 18.484,
711
+ "step": 9000
712
+ },
713
+ {
714
+ "epoch": 0.6636159778308508,
715
+ "grad_norm": 0.9164955615997314,
716
+ "learning_rate": 0.0001576729993109558,
717
+ "loss": 0.7792,
718
+ "step": 9100
719
+ },
720
+ {
721
+ "epoch": 0.6709084611037174,
722
+ "grad_norm": 0.8446054458618164,
723
+ "learning_rate": 0.000157180824884339,
724
+ "loss": 0.7661,
725
+ "step": 9200
726
+ },
727
+ {
728
+ "epoch": 0.6782009443765838,
729
+ "grad_norm": 0.8793991804122925,
730
+ "learning_rate": 0.00015668865045772223,
731
+ "loss": 0.7678,
732
+ "step": 9300
733
+ },
734
+ {
735
+ "epoch": 0.6854934276494503,
736
+ "grad_norm": 0.8772592544555664,
737
+ "learning_rate": 0.00015619647603110542,
738
+ "loss": 0.7708,
739
+ "step": 9400
740
+ },
741
+ {
742
+ "epoch": 0.6927859109223168,
743
+ "grad_norm": 0.854118824005127,
744
+ "learning_rate": 0.00015570430160448864,
745
+ "loss": 0.7616,
746
+ "step": 9500
747
+ },
748
+ {
749
+ "epoch": 0.7000783941951834,
750
+ "grad_norm": 0.8653910756111145,
751
+ "learning_rate": 0.00015521212717787183,
752
+ "loss": 0.767,
753
+ "step": 9600
754
+ },
755
+ {
756
+ "epoch": 0.7073708774680498,
757
+ "grad_norm": 0.8890120387077332,
758
+ "learning_rate": 0.00015471995275125505,
759
+ "loss": 0.7657,
760
+ "step": 9700
761
+ },
762
+ {
763
+ "epoch": 0.7146633607409163,
764
+ "grad_norm": 0.8451828360557556,
765
+ "learning_rate": 0.00015422777832463827,
766
+ "loss": 0.7656,
767
+ "step": 9800
768
+ },
769
+ {
770
+ "epoch": 0.7219558440137828,
771
+ "grad_norm": 0.9029329419136047,
772
+ "learning_rate": 0.00015373560389802146,
773
+ "loss": 0.7749,
774
+ "step": 9900
775
+ },
776
+ {
777
+ "epoch": 0.7292483272866492,
778
+ "grad_norm": 0.8538834452629089,
779
+ "learning_rate": 0.00015324342947140466,
780
+ "loss": 0.763,
781
+ "step": 10000
782
+ },
783
+ {
784
+ "epoch": 0.7292483272866492,
785
+ "eval_loss": 0.76123046875,
786
+ "eval_runtime": 60.847,
787
+ "eval_samples_per_second": 147.172,
788
+ "eval_steps_per_second": 18.407,
789
+ "step": 10000
790
+ },
791
+ {
792
+ "epoch": 0.7365408105595158,
793
+ "grad_norm": 0.8594367504119873,
794
+ "learning_rate": 0.00015275125504478788,
795
+ "loss": 0.7693,
796
+ "step": 10100
797
+ },
798
+ {
799
+ "epoch": 0.7438332938323823,
800
+ "grad_norm": 0.8748040199279785,
801
+ "learning_rate": 0.0001522590806181711,
802
+ "loss": 0.7684,
803
+ "step": 10200
804
+ },
805
+ {
806
+ "epoch": 0.7511257771052487,
807
+ "grad_norm": 0.9177483320236206,
808
+ "learning_rate": 0.0001517669061915543,
809
+ "loss": 0.7599,
810
+ "step": 10300
811
+ },
812
+ {
813
+ "epoch": 0.7584182603781152,
814
+ "grad_norm": 0.8988757729530334,
815
+ "learning_rate": 0.0001512747317649375,
816
+ "loss": 0.7648,
817
+ "step": 10400
818
+ },
819
+ {
820
+ "epoch": 0.7657107436509818,
821
+ "grad_norm": 0.8735676407814026,
822
+ "learning_rate": 0.00015078255733832073,
823
+ "loss": 0.7656,
824
+ "step": 10500
825
+ },
826
+ {
827
+ "epoch": 0.7730032269238483,
828
+ "grad_norm": 0.8750614523887634,
829
+ "learning_rate": 0.00015029038291170392,
830
+ "loss": 0.7632,
831
+ "step": 10600
832
+ },
833
+ {
834
+ "epoch": 0.7802957101967147,
835
+ "grad_norm": 0.8786306381225586,
836
+ "learning_rate": 0.0001497982084850871,
837
+ "loss": 0.7659,
838
+ "step": 10700
839
+ },
840
+ {
841
+ "epoch": 0.7875881934695812,
842
+ "grad_norm": 0.811834990978241,
843
+ "learning_rate": 0.00014930603405847033,
844
+ "loss": 0.7652,
845
+ "step": 10800
846
+ },
847
+ {
848
+ "epoch": 0.7948806767424477,
849
+ "grad_norm": 0.8844282031059265,
850
+ "learning_rate": 0.00014881385963185352,
851
+ "loss": 0.7623,
852
+ "step": 10900
853
+ },
854
+ {
855
+ "epoch": 0.8021731600153142,
856
+ "grad_norm": 0.8444844484329224,
857
+ "learning_rate": 0.00014832168520523674,
858
+ "loss": 0.7622,
859
+ "step": 11000
860
+ },
861
+ {
862
+ "epoch": 0.8021731600153142,
863
+ "eval_loss": 0.75812828540802,
864
+ "eval_runtime": 60.7569,
865
+ "eval_samples_per_second": 147.391,
866
+ "eval_steps_per_second": 18.434,
867
+ "step": 11000
868
+ },
869
+ {
870
+ "epoch": 0.8094656432881807,
871
+ "grad_norm": 0.8396947979927063,
872
+ "learning_rate": 0.00014782951077861996,
873
+ "loss": 0.7673,
874
+ "step": 11100
875
+ },
876
+ {
877
+ "epoch": 0.8167581265610472,
878
+ "grad_norm": 0.8890758752822876,
879
+ "learning_rate": 0.00014733733635200315,
880
+ "loss": 0.7551,
881
+ "step": 11200
882
+ },
883
+ {
884
+ "epoch": 0.8240506098339136,
885
+ "grad_norm": 0.8038908839225769,
886
+ "learning_rate": 0.00014684516192538634,
887
+ "loss": 0.7612,
888
+ "step": 11300
889
+ },
890
+ {
891
+ "epoch": 0.8313430931067802,
892
+ "grad_norm": 0.8224745392799377,
893
+ "learning_rate": 0.0001463529874987696,
894
+ "loss": 0.7618,
895
+ "step": 11400
896
+ },
897
+ {
898
+ "epoch": 0.8386355763796467,
899
+ "grad_norm": 0.8691264390945435,
900
+ "learning_rate": 0.00014586081307215278,
901
+ "loss": 0.7618,
902
+ "step": 11500
903
+ },
904
+ {
905
+ "epoch": 0.8459280596525132,
906
+ "grad_norm": 0.8442777395248413,
907
+ "learning_rate": 0.00014536863864553597,
908
+ "loss": 0.7671,
909
+ "step": 11600
910
+ },
911
+ {
912
+ "epoch": 0.8532205429253796,
913
+ "grad_norm": 0.8520532846450806,
914
+ "learning_rate": 0.0001448764642189192,
915
+ "loss": 0.7625,
916
+ "step": 11700
917
+ },
918
+ {
919
+ "epoch": 0.8605130261982462,
920
+ "grad_norm": 0.908760666847229,
921
+ "learning_rate": 0.0001443842897923024,
922
+ "loss": 0.7615,
923
+ "step": 11800
924
+ },
925
+ {
926
+ "epoch": 0.8678055094711127,
927
+ "grad_norm": 0.8004080057144165,
928
+ "learning_rate": 0.0001438921153656856,
929
+ "loss": 0.7632,
930
+ "step": 11900
931
+ },
932
+ {
933
+ "epoch": 0.8750979927439791,
934
+ "grad_norm": 0.8449864983558655,
935
+ "learning_rate": 0.00014339994093906882,
936
+ "loss": 0.7574,
937
+ "step": 12000
938
+ },
939
+ {
940
+ "epoch": 0.8750979927439791,
941
+ "eval_loss": 0.752128005027771,
942
+ "eval_runtime": 61.1399,
943
+ "eval_samples_per_second": 146.467,
944
+ "eval_steps_per_second": 18.319,
945
+ "step": 12000
946
+ },
947
+ {
948
+ "epoch": 0.8823904760168456,
949
+ "grad_norm": 0.8218274116516113,
950
+ "learning_rate": 0.00014290776651245201,
951
+ "loss": 0.7555,
952
+ "step": 12100
953
+ },
954
+ {
955
+ "epoch": 0.8896829592897121,
956
+ "grad_norm": 0.8944920897483826,
957
+ "learning_rate": 0.00014241559208583523,
958
+ "loss": 0.7594,
959
+ "step": 12200
960
+ },
961
+ {
962
+ "epoch": 0.8969754425625787,
963
+ "grad_norm": 0.9254937767982483,
964
+ "learning_rate": 0.00014192341765921845,
965
+ "loss": 0.7598,
966
+ "step": 12300
967
+ },
968
+ {
969
+ "epoch": 0.9042679258354451,
970
+ "grad_norm": 0.8887091875076294,
971
+ "learning_rate": 0.00014143124323260164,
972
+ "loss": 0.7625,
973
+ "step": 12400
974
+ },
975
+ {
976
+ "epoch": 0.9115604091083116,
977
+ "grad_norm": 0.8478124737739563,
978
+ "learning_rate": 0.00014093906880598484,
979
+ "loss": 0.756,
980
+ "step": 12500
981
+ },
982
+ {
983
+ "epoch": 0.9188528923811781,
984
+ "grad_norm": 0.9377927780151367,
985
+ "learning_rate": 0.00014044689437936805,
986
+ "loss": 0.7606,
987
+ "step": 12600
988
+ },
989
+ {
990
+ "epoch": 0.9261453756540446,
991
+ "grad_norm": 0.838175892829895,
992
+ "learning_rate": 0.00013995471995275127,
993
+ "loss": 0.7605,
994
+ "step": 12700
995
+ },
996
+ {
997
+ "epoch": 0.9334378589269111,
998
+ "grad_norm": 0.8345216512680054,
999
+ "learning_rate": 0.00013946254552613447,
1000
+ "loss": 0.7568,
1001
+ "step": 12800
1002
+ },
1003
+ {
1004
+ "epoch": 0.9407303421997776,
1005
+ "grad_norm": 0.894477367401123,
1006
+ "learning_rate": 0.00013897037109951766,
1007
+ "loss": 0.7535,
1008
+ "step": 12900
1009
+ },
1010
+ {
1011
+ "epoch": 0.948022825472644,
1012
+ "grad_norm": 0.849010169506073,
1013
+ "learning_rate": 0.00013847819667290088,
1014
+ "loss": 0.7465,
1015
+ "step": 13000
1016
+ },
1017
+ {
1018
+ "epoch": 0.948022825472644,
1019
+ "eval_loss": 0.7492165565490723,
1020
+ "eval_runtime": 60.7079,
1021
+ "eval_samples_per_second": 147.51,
1022
+ "eval_steps_per_second": 18.449,
1023
+ "step": 13000
1024
+ },
1025
+ {
1026
+ "epoch": 0.9553153087455105,
1027
+ "grad_norm": 0.8754207491874695,
1028
+ "learning_rate": 0.0001379860222462841,
1029
+ "loss": 0.7576,
1030
+ "step": 13100
1031
+ },
1032
+ {
1033
+ "epoch": 0.9626077920183771,
1034
+ "grad_norm": 0.8984807133674622,
1035
+ "learning_rate": 0.0001374938478196673,
1036
+ "loss": 0.7493,
1037
+ "step": 13200
1038
+ },
1039
+ {
1040
+ "epoch": 0.9699002752912436,
1041
+ "grad_norm": 0.8458361029624939,
1042
+ "learning_rate": 0.0001370016733930505,
1043
+ "loss": 0.7468,
1044
+ "step": 13300
1045
+ },
1046
+ {
1047
+ "epoch": 0.97719275856411,
1048
+ "grad_norm": 0.9169609546661377,
1049
+ "learning_rate": 0.0001365094989664337,
1050
+ "loss": 0.7515,
1051
+ "step": 13400
1052
+ },
1053
+ {
1054
+ "epoch": 0.9844852418369765,
1055
+ "grad_norm": 0.8027638792991638,
1056
+ "learning_rate": 0.00013601732453981692,
1057
+ "loss": 0.7551,
1058
+ "step": 13500
1059
+ },
1060
+ {
1061
+ "epoch": 0.9917777251098431,
1062
+ "grad_norm": 0.8572927117347717,
1063
+ "learning_rate": 0.00013552515011320014,
1064
+ "loss": 0.7481,
1065
+ "step": 13600
1066
+ },
1067
+ {
1068
+ "epoch": 0.9990702083827095,
1069
+ "grad_norm": 0.8624053001403809,
1070
+ "learning_rate": 0.00013503297568658333,
1071
+ "loss": 0.7481,
1072
+ "step": 13700
1073
+ },
1074
+ {
1075
+ "epoch": 1.0063991540719404,
1076
+ "grad_norm": 0.8915347456932068,
1077
+ "learning_rate": 0.00013454080125996652,
1078
+ "loss": 0.7463,
1079
+ "step": 13800
1080
+ },
1081
+ {
1082
+ "epoch": 1.0136916373448068,
1083
+ "grad_norm": 0.8233557939529419,
1084
+ "learning_rate": 0.00013404862683334977,
1085
+ "loss": 0.7398,
1086
+ "step": 13900
1087
+ },
1088
+ {
1089
+ "epoch": 1.0209841206176733,
1090
+ "grad_norm": 0.8467598557472229,
1091
+ "learning_rate": 0.00013355645240673296,
1092
+ "loss": 0.7402,
1093
+ "step": 14000
1094
+ },
1095
+ {
1096
+ "epoch": 1.0209841206176733,
1097
+ "eval_loss": 0.7458442449569702,
1098
+ "eval_runtime": 60.6887,
1099
+ "eval_samples_per_second": 147.556,
1100
+ "eval_steps_per_second": 18.455,
1101
+ "step": 14000
1102
+ },
1103
+ {
1104
+ "epoch": 1.0282766038905398,
1105
+ "grad_norm": 0.852739691734314,
1106
+ "learning_rate": 0.00013306427798011615,
1107
+ "loss": 0.7436,
1108
+ "step": 14100
1109
+ },
1110
+ {
1111
+ "epoch": 1.0355690871634062,
1112
+ "grad_norm": 0.8501101136207581,
1113
+ "learning_rate": 0.00013257210355349937,
1114
+ "loss": 0.7472,
1115
+ "step": 14200
1116
+ },
1117
+ {
1118
+ "epoch": 1.0428615704362727,
1119
+ "grad_norm": 0.8830447793006897,
1120
+ "learning_rate": 0.0001320799291268826,
1121
+ "loss": 0.7438,
1122
+ "step": 14300
1123
+ },
1124
+ {
1125
+ "epoch": 1.0501540537091394,
1126
+ "grad_norm": 0.8827272057533264,
1127
+ "learning_rate": 0.00013158775470026578,
1128
+ "loss": 0.7439,
1129
+ "step": 14400
1130
+ },
1131
+ {
1132
+ "epoch": 1.0574465369820059,
1133
+ "grad_norm": 0.7875618934631348,
1134
+ "learning_rate": 0.000131095580273649,
1135
+ "loss": 0.7426,
1136
+ "step": 14500
1137
+ },
1138
+ {
1139
+ "epoch": 1.0647390202548723,
1140
+ "grad_norm": 0.9906949996948242,
1141
+ "learning_rate": 0.0001306034058470322,
1142
+ "loss": 0.7418,
1143
+ "step": 14600
1144
+ },
1145
+ {
1146
+ "epoch": 1.0720315035277388,
1147
+ "grad_norm": 0.8803852200508118,
1148
+ "learning_rate": 0.00013011123142041538,
1149
+ "loss": 0.7421,
1150
+ "step": 14700
1151
+ },
1152
+ {
1153
+ "epoch": 1.0793239868006053,
1154
+ "grad_norm": 0.8951194286346436,
1155
+ "learning_rate": 0.0001296190569937986,
1156
+ "loss": 0.7429,
1157
+ "step": 14800
1158
+ },
1159
+ {
1160
+ "epoch": 1.0866164700734717,
1161
+ "grad_norm": 0.8548495769500732,
1162
+ "learning_rate": 0.00012912688256718182,
1163
+ "loss": 0.7462,
1164
+ "step": 14900
1165
+ },
1166
+ {
1167
+ "epoch": 1.0939089533463382,
1168
+ "grad_norm": 0.9326722025871277,
1169
+ "learning_rate": 0.00012863470814056501,
1170
+ "loss": 0.7515,
1171
+ "step": 15000
1172
+ },
1173
+ {
1174
+ "epoch": 1.0939089533463382,
1175
+ "eval_loss": 0.7423983812332153,
1176
+ "eval_runtime": 61.1091,
1177
+ "eval_samples_per_second": 146.541,
1178
+ "eval_steps_per_second": 18.328,
1179
+ "step": 15000
1180
+ },
1181
+ {
1182
+ "epoch": 1.1012014366192047,
1183
+ "grad_norm": 0.8803513646125793,
1184
+ "learning_rate": 0.00012814253371394823,
1185
+ "loss": 0.7369,
1186
+ "step": 15100
1187
+ },
1188
+ {
1189
+ "epoch": 1.1084939198920711,
1190
+ "grad_norm": 0.8555076122283936,
1191
+ "learning_rate": 0.00012765035928733145,
1192
+ "loss": 0.7414,
1193
+ "step": 15200
1194
+ },
1195
+ {
1196
+ "epoch": 1.1157864031649378,
1197
+ "grad_norm": 0.8760358691215515,
1198
+ "learning_rate": 0.00012715818486071464,
1199
+ "loss": 0.741,
1200
+ "step": 15300
1201
+ },
1202
+ {
1203
+ "epoch": 1.1230788864378043,
1204
+ "grad_norm": 0.8444579839706421,
1205
+ "learning_rate": 0.00012666601043409784,
1206
+ "loss": 0.7448,
1207
+ "step": 15400
1208
+ },
1209
+ {
1210
+ "epoch": 1.1303713697106708,
1211
+ "grad_norm": 0.8995528221130371,
1212
+ "learning_rate": 0.00012617383600748106,
1213
+ "loss": 0.7436,
1214
+ "step": 15500
1215
+ },
1216
+ {
1217
+ "epoch": 1.1376638529835372,
1218
+ "grad_norm": 0.8966475129127502,
1219
+ "learning_rate": 0.00012568166158086427,
1220
+ "loss": 0.7485,
1221
+ "step": 15600
1222
+ },
1223
+ {
1224
+ "epoch": 1.1449563362564037,
1225
+ "grad_norm": 0.8527953028678894,
1226
+ "learning_rate": 0.00012518948715424747,
1227
+ "loss": 0.7303,
1228
+ "step": 15700
1229
+ },
1230
+ {
1231
+ "epoch": 1.1522488195292702,
1232
+ "grad_norm": 0.8657513856887817,
1233
+ "learning_rate": 0.00012469731272763069,
1234
+ "loss": 0.7431,
1235
+ "step": 15800
1236
+ },
1237
+ {
1238
+ "epoch": 1.1595413028021366,
1239
+ "grad_norm": 0.8745185136795044,
1240
+ "learning_rate": 0.00012420513830101388,
1241
+ "loss": 0.7426,
1242
+ "step": 15900
1243
+ },
1244
+ {
1245
+ "epoch": 1.166833786075003,
1246
+ "grad_norm": 0.8729378581047058,
1247
+ "learning_rate": 0.0001237129638743971,
1248
+ "loss": 0.7389,
1249
+ "step": 16000
1250
+ },
1251
+ {
1252
+ "epoch": 1.166833786075003,
1253
+ "eval_loss": 0.740699291229248,
1254
+ "eval_runtime": 60.635,
1255
+ "eval_samples_per_second": 147.687,
1256
+ "eval_steps_per_second": 18.471,
1257
+ "step": 16000
1258
+ },
1259
+ {
1260
+ "epoch": 1.1741262693478696,
1261
+ "grad_norm": 0.8877021670341492,
1262
+ "learning_rate": 0.00012322078944778032,
1263
+ "loss": 0.7419,
1264
+ "step": 16100
1265
+ },
1266
+ {
1267
+ "epoch": 1.1814187526207363,
1268
+ "grad_norm": 0.9095293283462524,
1269
+ "learning_rate": 0.0001227286150211635,
1270
+ "loss": 0.7365,
1271
+ "step": 16200
1272
+ },
1273
+ {
1274
+ "epoch": 1.1887112358936027,
1275
+ "grad_norm": 0.8597880601882935,
1276
+ "learning_rate": 0.0001222364405945467,
1277
+ "loss": 0.7336,
1278
+ "step": 16300
1279
+ },
1280
+ {
1281
+ "epoch": 1.1960037191664692,
1282
+ "grad_norm": 0.9574359059333801,
1283
+ "learning_rate": 0.0001217442661679299,
1284
+ "loss": 0.7394,
1285
+ "step": 16400
1286
+ },
1287
+ {
1288
+ "epoch": 1.2032962024393357,
1289
+ "grad_norm": 0.8484875559806824,
1290
+ "learning_rate": 0.00012125209174131314,
1291
+ "loss": 0.7392,
1292
+ "step": 16500
1293
+ },
1294
+ {
1295
+ "epoch": 1.2105886857122021,
1296
+ "grad_norm": 0.8847618699073792,
1297
+ "learning_rate": 0.00012075991731469633,
1298
+ "loss": 0.7427,
1299
+ "step": 16600
1300
+ },
1301
+ {
1302
+ "epoch": 1.2178811689850686,
1303
+ "grad_norm": 0.8780632019042969,
1304
+ "learning_rate": 0.00012026774288807954,
1305
+ "loss": 0.7399,
1306
+ "step": 16700
1307
+ },
1308
+ {
1309
+ "epoch": 1.225173652257935,
1310
+ "grad_norm": 0.8698965311050415,
1311
+ "learning_rate": 0.00011977556846146274,
1312
+ "loss": 0.7395,
1313
+ "step": 16800
1314
+ },
1315
+ {
1316
+ "epoch": 1.2324661355308015,
1317
+ "grad_norm": 0.8717935085296631,
1318
+ "learning_rate": 0.00011928339403484596,
1319
+ "loss": 0.7404,
1320
+ "step": 16900
1321
+ },
1322
+ {
1323
+ "epoch": 1.239758618803668,
1324
+ "grad_norm": 0.8375683426856995,
1325
+ "learning_rate": 0.00011879121960822917,
1326
+ "loss": 0.7405,
1327
+ "step": 17000
1328
+ },
1329
+ {
1330
+ "epoch": 1.239758618803668,
1331
+ "eval_loss": 0.7371787428855896,
1332
+ "eval_runtime": 60.9373,
1333
+ "eval_samples_per_second": 146.954,
1334
+ "eval_steps_per_second": 18.38,
1335
+ "step": 17000
1336
+ },
1337
+ {
1338
+ "epoch": 1.2470511020765347,
1339
+ "grad_norm": 0.8756095170974731,
1340
+ "learning_rate": 0.00011829904518161237,
1341
+ "loss": 0.736,
1342
+ "step": 17100
1343
+ },
1344
+ {
1345
+ "epoch": 1.2543435853494012,
1346
+ "grad_norm": 0.8513076901435852,
1347
+ "learning_rate": 0.00011780687075499556,
1348
+ "loss": 0.7399,
1349
+ "step": 17200
1350
+ },
1351
+ {
1352
+ "epoch": 1.2616360686222676,
1353
+ "grad_norm": 0.8297843337059021,
1354
+ "learning_rate": 0.0001173146963283788,
1355
+ "loss": 0.7406,
1356
+ "step": 17300
1357
+ },
1358
+ {
1359
+ "epoch": 1.268928551895134,
1360
+ "grad_norm": 0.8896269202232361,
1361
+ "learning_rate": 0.00011682252190176199,
1362
+ "loss": 0.7346,
1363
+ "step": 17400
1364
+ },
1365
+ {
1366
+ "epoch": 1.2762210351680006,
1367
+ "grad_norm": 0.874168336391449,
1368
+ "learning_rate": 0.0001163303474751452,
1369
+ "loss": 0.736,
1370
+ "step": 17500
1371
+ },
1372
+ {
1373
+ "epoch": 1.283513518440867,
1374
+ "grad_norm": 0.9101394414901733,
1375
+ "learning_rate": 0.0001158381730485284,
1376
+ "loss": 0.7376,
1377
+ "step": 17600
1378
+ },
1379
+ {
1380
+ "epoch": 1.2908060017137335,
1381
+ "grad_norm": 0.9011333584785461,
1382
+ "learning_rate": 0.00011534599862191162,
1383
+ "loss": 0.7361,
1384
+ "step": 17700
1385
+ },
1386
+ {
1387
+ "epoch": 1.2980984849866002,
1388
+ "grad_norm": 0.8839349746704102,
1389
+ "learning_rate": 0.00011485382419529482,
1390
+ "loss": 0.7373,
1391
+ "step": 17800
1392
+ },
1393
+ {
1394
+ "epoch": 1.3053909682594664,
1395
+ "grad_norm": 0.830528974533081,
1396
+ "learning_rate": 0.00011436164976867803,
1397
+ "loss": 0.7336,
1398
+ "step": 17900
1399
+ },
1400
+ {
1401
+ "epoch": 1.3126834515323331,
1402
+ "grad_norm": 0.8777081370353699,
1403
+ "learning_rate": 0.00011386947534206122,
1404
+ "loss": 0.7379,
1405
+ "step": 18000
1406
+ },
1407
+ {
1408
+ "epoch": 1.3126834515323331,
1409
+ "eval_loss": 0.7359282970428467,
1410
+ "eval_runtime": 60.8023,
1411
+ "eval_samples_per_second": 147.281,
1412
+ "eval_steps_per_second": 18.42,
1413
+ "step": 18000
1414
+ },
1415
+ {
1416
+ "epoch": 1.3199759348051996,
1417
+ "grad_norm": 0.8853510022163391,
1418
+ "learning_rate": 0.00011337730091544443,
1419
+ "loss": 0.7376,
1420
+ "step": 18100
1421
+ },
1422
+ {
1423
+ "epoch": 1.327268418078066,
1424
+ "grad_norm": 0.9219810366630554,
1425
+ "learning_rate": 0.00011288512648882766,
1426
+ "loss": 0.7399,
1427
+ "step": 18200
1428
+ },
1429
+ {
1430
+ "epoch": 1.3345609013509325,
1431
+ "grad_norm": 0.9233282208442688,
1432
+ "learning_rate": 0.00011239295206221085,
1433
+ "loss": 0.7399,
1434
+ "step": 18300
1435
+ },
1436
+ {
1437
+ "epoch": 1.341853384623799,
1438
+ "grad_norm": 0.8359719514846802,
1439
+ "learning_rate": 0.00011190077763559406,
1440
+ "loss": 0.7366,
1441
+ "step": 18400
1442
+ },
1443
+ {
1444
+ "epoch": 1.3491458678966655,
1445
+ "grad_norm": 0.8673479557037354,
1446
+ "learning_rate": 0.00011140860320897726,
1447
+ "loss": 0.7398,
1448
+ "step": 18500
1449
+ },
1450
+ {
1451
+ "epoch": 1.356438351169532,
1452
+ "grad_norm": 0.8565610647201538,
1453
+ "learning_rate": 0.00011091642878236048,
1454
+ "loss": 0.7278,
1455
+ "step": 18600
1456
+ },
1457
+ {
1458
+ "epoch": 1.3637308344423986,
1459
+ "grad_norm": 0.8547226190567017,
1460
+ "learning_rate": 0.00011042425435574369,
1461
+ "loss": 0.7381,
1462
+ "step": 18700
1463
+ },
1464
+ {
1465
+ "epoch": 1.3710233177152649,
1466
+ "grad_norm": 0.897081732749939,
1467
+ "learning_rate": 0.00010993207992912688,
1468
+ "loss": 0.7339,
1469
+ "step": 18800
1470
+ },
1471
+ {
1472
+ "epoch": 1.3783158009881316,
1473
+ "grad_norm": 0.8852410912513733,
1474
+ "learning_rate": 0.00010943990550251008,
1475
+ "loss": 0.7342,
1476
+ "step": 18900
1477
+ },
1478
+ {
1479
+ "epoch": 1.385608284260998,
1480
+ "grad_norm": 0.9213690161705017,
1481
+ "learning_rate": 0.00010894773107589332,
1482
+ "loss": 0.7389,
1483
+ "step": 19000
1484
+ },
1485
+ {
1486
+ "epoch": 1.385608284260998,
1487
+ "eval_loss": 0.7335625886917114,
1488
+ "eval_runtime": 60.8231,
1489
+ "eval_samples_per_second": 147.23,
1490
+ "eval_steps_per_second": 18.414,
1491
+ "step": 19000
1492
+ },
1493
+ {
1494
+ "epoch": 1.3929007675338645,
1495
+ "grad_norm": 0.8398423790931702,
1496
+ "learning_rate": 0.00010845555664927651,
1497
+ "loss": 0.7274,
1498
+ "step": 19100
1499
+ },
1500
+ {
1501
+ "epoch": 1.400193250806731,
1502
+ "grad_norm": 0.8863806128501892,
1503
+ "learning_rate": 0.00010796338222265971,
1504
+ "loss": 0.7331,
1505
+ "step": 19200
1506
+ },
1507
+ {
1508
+ "epoch": 1.4074857340795974,
1509
+ "grad_norm": 0.8836521506309509,
1510
+ "learning_rate": 0.00010747120779604292,
1511
+ "loss": 0.7334,
1512
+ "step": 19300
1513
+ },
1514
+ {
1515
+ "epoch": 1.414778217352464,
1516
+ "grad_norm": 0.8278964757919312,
1517
+ "learning_rate": 0.00010697903336942614,
1518
+ "loss": 0.7281,
1519
+ "step": 19400
1520
+ },
1521
+ {
1522
+ "epoch": 1.4220707006253304,
1523
+ "grad_norm": 0.8681420087814331,
1524
+ "learning_rate": 0.00010648685894280934,
1525
+ "loss": 0.7345,
1526
+ "step": 19500
1527
+ },
1528
+ {
1529
+ "epoch": 1.429363183898197,
1530
+ "grad_norm": 0.8721694946289062,
1531
+ "learning_rate": 0.00010599468451619255,
1532
+ "loss": 0.7246,
1533
+ "step": 19600
1534
+ },
1535
+ {
1536
+ "epoch": 1.4366556671710633,
1537
+ "grad_norm": 0.8880037665367126,
1538
+ "learning_rate": 0.00010550251008957574,
1539
+ "loss": 0.7321,
1540
+ "step": 19700
1541
+ },
1542
+ {
1543
+ "epoch": 1.44394815044393,
1544
+ "grad_norm": 0.8522552251815796,
1545
+ "learning_rate": 0.00010501033566295895,
1546
+ "loss": 0.734,
1547
+ "step": 19800
1548
+ },
1549
+ {
1550
+ "epoch": 1.4512406337167965,
1551
+ "grad_norm": 0.8816943168640137,
1552
+ "learning_rate": 0.00010451816123634217,
1553
+ "loss": 0.7333,
1554
+ "step": 19900
1555
+ },
1556
+ {
1557
+ "epoch": 1.458533116989663,
1558
+ "grad_norm": 0.8068501949310303,
1559
+ "learning_rate": 0.00010402598680972537,
1560
+ "loss": 0.7267,
1561
+ "step": 20000
1562
+ },
1563
+ {
1564
+ "epoch": 1.458533116989663,
1565
+ "eval_loss": 0.731645405292511,
1566
+ "eval_runtime": 61.0998,
1567
+ "eval_samples_per_second": 146.563,
1568
+ "eval_steps_per_second": 18.331,
1569
+ "step": 20000
1570
+ },
1571
+ {
1572
+ "epoch": 1.4658256002625294,
1573
+ "grad_norm": 0.8473337888717651,
1574
+ "learning_rate": 0.00010353381238310858,
1575
+ "loss": 0.7328,
1576
+ "step": 20100
1577
+ },
1578
+ {
1579
+ "epoch": 1.4731180835353959,
1580
+ "grad_norm": 0.9009122252464294,
1581
+ "learning_rate": 0.00010304163795649177,
1582
+ "loss": 0.733,
1583
+ "step": 20200
1584
+ },
1585
+ {
1586
+ "epoch": 1.4804105668082623,
1587
+ "grad_norm": 0.8225035667419434,
1588
+ "learning_rate": 0.000102549463529875,
1589
+ "loss": 0.7311,
1590
+ "step": 20300
1591
+ },
1592
+ {
1593
+ "epoch": 1.4877030500811288,
1594
+ "grad_norm": 0.8552617430686951,
1595
+ "learning_rate": 0.00010205728910325821,
1596
+ "loss": 0.7282,
1597
+ "step": 20400
1598
+ },
1599
+ {
1600
+ "epoch": 1.4949955333539955,
1601
+ "grad_norm": 0.8690235614776611,
1602
+ "learning_rate": 0.0001015651146766414,
1603
+ "loss": 0.7329,
1604
+ "step": 20500
1605
+ },
1606
+ {
1607
+ "epoch": 1.5022880166268617,
1608
+ "grad_norm": 0.8566781878471375,
1609
+ "learning_rate": 0.0001010729402500246,
1610
+ "loss": 0.7358,
1611
+ "step": 20600
1612
+ },
1613
+ {
1614
+ "epoch": 1.5095804998997284,
1615
+ "grad_norm": 0.9174933433532715,
1616
+ "learning_rate": 0.00010058076582340782,
1617
+ "loss": 0.7266,
1618
+ "step": 20700
1619
+ },
1620
+ {
1621
+ "epoch": 1.516872983172595,
1622
+ "grad_norm": 0.9414506554603577,
1623
+ "learning_rate": 0.00010008859139679103,
1624
+ "loss": 0.7321,
1625
+ "step": 20800
1626
+ },
1627
+ {
1628
+ "epoch": 1.5241654664454614,
1629
+ "grad_norm": 0.9433586001396179,
1630
+ "learning_rate": 9.959641697017424e-05,
1631
+ "loss": 0.7355,
1632
+ "step": 20900
1633
+ },
1634
+ {
1635
+ "epoch": 1.5314579497183278,
1636
+ "grad_norm": 0.8544315695762634,
1637
+ "learning_rate": 9.910424254355744e-05,
1638
+ "loss": 0.7313,
1639
+ "step": 21000
1640
+ },
1641
+ {
1642
+ "epoch": 1.5314579497183278,
1643
+ "eval_loss": 0.7285299301147461,
1644
+ "eval_runtime": 60.6886,
1645
+ "eval_samples_per_second": 147.557,
1646
+ "eval_steps_per_second": 18.455,
1647
+ "step": 21000
1648
+ },
1649
+ {
1650
+ "epoch": 1.5387504329911943,
1651
+ "grad_norm": 0.893223762512207,
1652
+ "learning_rate": 9.861206811694065e-05,
1653
+ "loss": 0.7329,
1654
+ "step": 21100
1655
+ },
1656
+ {
1657
+ "epoch": 1.546042916264061,
1658
+ "grad_norm": 0.8868634104728699,
1659
+ "learning_rate": 9.811989369032387e-05,
1660
+ "loss": 0.7276,
1661
+ "step": 21200
1662
+ },
1663
+ {
1664
+ "epoch": 1.5533353995369272,
1665
+ "grad_norm": 0.8362566232681274,
1666
+ "learning_rate": 9.762771926370706e-05,
1667
+ "loss": 0.723,
1668
+ "step": 21300
1669
+ },
1670
+ {
1671
+ "epoch": 1.560627882809794,
1672
+ "grad_norm": 0.8852083086967468,
1673
+ "learning_rate": 9.713554483709026e-05,
1674
+ "loss": 0.7281,
1675
+ "step": 21400
1676
+ },
1677
+ {
1678
+ "epoch": 1.5679203660826602,
1679
+ "grad_norm": 0.8901813626289368,
1680
+ "learning_rate": 9.664337041047348e-05,
1681
+ "loss": 0.7307,
1682
+ "step": 21500
1683
+ },
1684
+ {
1685
+ "epoch": 1.5752128493555269,
1686
+ "grad_norm": 0.8210172057151794,
1687
+ "learning_rate": 9.615119598385667e-05,
1688
+ "loss": 0.7245,
1689
+ "step": 21600
1690
+ },
1691
+ {
1692
+ "epoch": 1.5825053326283933,
1693
+ "grad_norm": 0.8676414489746094,
1694
+ "learning_rate": 9.56590215572399e-05,
1695
+ "loss": 0.7294,
1696
+ "step": 21700
1697
+ },
1698
+ {
1699
+ "epoch": 1.5897978159012598,
1700
+ "grad_norm": 0.8923740983009338,
1701
+ "learning_rate": 9.51668471306231e-05,
1702
+ "loss": 0.7242,
1703
+ "step": 21800
1704
+ },
1705
+ {
1706
+ "epoch": 1.5970902991741263,
1707
+ "grad_norm": 0.8402920365333557,
1708
+ "learning_rate": 9.46746727040063e-05,
1709
+ "loss": 0.7258,
1710
+ "step": 21900
1711
+ },
1712
+ {
1713
+ "epoch": 1.6043827824469927,
1714
+ "grad_norm": 0.8525983691215515,
1715
+ "learning_rate": 9.418249827738951e-05,
1716
+ "loss": 0.7294,
1717
+ "step": 22000
1718
+ },
1719
+ {
1720
+ "epoch": 1.6043827824469927,
1721
+ "eval_loss": 0.7267495393753052,
1722
+ "eval_runtime": 61.1086,
1723
+ "eval_samples_per_second": 146.542,
1724
+ "eval_steps_per_second": 18.328,
1725
+ "step": 22000
1726
+ },
1727
+ {
1728
+ "epoch": 1.6116752657198594,
1729
+ "grad_norm": 0.8605002164840698,
1730
+ "learning_rate": 9.369032385077272e-05,
1731
+ "loss": 0.7259,
1732
+ "step": 22100
1733
+ },
1734
+ {
1735
+ "epoch": 1.6189677489927257,
1736
+ "grad_norm": 0.8606895208358765,
1737
+ "learning_rate": 9.319814942415592e-05,
1738
+ "loss": 0.7275,
1739
+ "step": 22200
1740
+ },
1741
+ {
1742
+ "epoch": 1.6262602322655924,
1743
+ "grad_norm": 0.8824227452278137,
1744
+ "learning_rate": 9.270597499753914e-05,
1745
+ "loss": 0.7245,
1746
+ "step": 22300
1747
+ },
1748
+ {
1749
+ "epoch": 1.6335527155384586,
1750
+ "grad_norm": 0.8670118451118469,
1751
+ "learning_rate": 9.221380057092233e-05,
1752
+ "loss": 0.719,
1753
+ "step": 22400
1754
+ },
1755
+ {
1756
+ "epoch": 1.6408451988113253,
1757
+ "grad_norm": 0.92063307762146,
1758
+ "learning_rate": 9.172162614430555e-05,
1759
+ "loss": 0.7293,
1760
+ "step": 22500
1761
+ },
1762
+ {
1763
+ "epoch": 1.6481376820841918,
1764
+ "grad_norm": 0.8425260782241821,
1765
+ "learning_rate": 9.122945171768876e-05,
1766
+ "loss": 0.728,
1767
+ "step": 22600
1768
+ },
1769
+ {
1770
+ "epoch": 1.6554301653570582,
1771
+ "grad_norm": 0.9162302017211914,
1772
+ "learning_rate": 9.073727729107196e-05,
1773
+ "loss": 0.7265,
1774
+ "step": 22700
1775
+ },
1776
+ {
1777
+ "epoch": 1.6627226486299247,
1778
+ "grad_norm": 0.8905067443847656,
1779
+ "learning_rate": 9.024510286445517e-05,
1780
+ "loss": 0.7256,
1781
+ "step": 22800
1782
+ },
1783
+ {
1784
+ "epoch": 1.6700151319027912,
1785
+ "grad_norm": 0.874357283115387,
1786
+ "learning_rate": 8.975292843783837e-05,
1787
+ "loss": 0.7249,
1788
+ "step": 22900
1789
+ },
1790
+ {
1791
+ "epoch": 1.6773076151756579,
1792
+ "grad_norm": 0.842005729675293,
1793
+ "learning_rate": 8.926075401122158e-05,
1794
+ "loss": 0.7268,
1795
+ "step": 23000
1796
+ },
1797
+ {
1798
+ "epoch": 1.6773076151756579,
1799
+ "eval_loss": 0.7241798639297485,
1800
+ "eval_runtime": 60.7958,
1801
+ "eval_samples_per_second": 147.296,
1802
+ "eval_steps_per_second": 18.422,
1803
+ "step": 23000
1804
+ },
1805
+ {
1806
+ "epoch": 1.684600098448524,
1807
+ "grad_norm": 0.8695193529129028,
1808
+ "learning_rate": 8.876857958460478e-05,
1809
+ "loss": 0.7262,
1810
+ "step": 23100
1811
+ },
1812
+ {
1813
+ "epoch": 1.6918925817213908,
1814
+ "grad_norm": 0.8673058748245239,
1815
+ "learning_rate": 8.827640515798799e-05,
1816
+ "loss": 0.7303,
1817
+ "step": 23200
1818
+ },
1819
+ {
1820
+ "epoch": 1.699185064994257,
1821
+ "grad_norm": 0.9276596307754517,
1822
+ "learning_rate": 8.77842307313712e-05,
1823
+ "loss": 0.729,
1824
+ "step": 23300
1825
+ },
1826
+ {
1827
+ "epoch": 1.7064775482671237,
1828
+ "grad_norm": 0.8023722171783447,
1829
+ "learning_rate": 8.729205630475441e-05,
1830
+ "loss": 0.7212,
1831
+ "step": 23400
1832
+ },
1833
+ {
1834
+ "epoch": 1.7137700315399902,
1835
+ "grad_norm": 0.910897433757782,
1836
+ "learning_rate": 8.67998818781376e-05,
1837
+ "loss": 0.7252,
1838
+ "step": 23500
1839
+ },
1840
+ {
1841
+ "epoch": 1.7210625148128567,
1842
+ "grad_norm": 0.8714926838874817,
1843
+ "learning_rate": 8.630770745152083e-05,
1844
+ "loss": 0.7306,
1845
+ "step": 23600
1846
+ },
1847
+ {
1848
+ "epoch": 1.7283549980857231,
1849
+ "grad_norm": 0.8875166773796082,
1850
+ "learning_rate": 8.581553302490403e-05,
1851
+ "loss": 0.7235,
1852
+ "step": 23700
1853
+ },
1854
+ {
1855
+ "epoch": 1.7356474813585896,
1856
+ "grad_norm": 0.9132345914840698,
1857
+ "learning_rate": 8.532335859828724e-05,
1858
+ "loss": 0.7331,
1859
+ "step": 23800
1860
+ },
1861
+ {
1862
+ "epoch": 1.7429399646314563,
1863
+ "grad_norm": 0.8562710285186768,
1864
+ "learning_rate": 8.483118417167044e-05,
1865
+ "loss": 0.7282,
1866
+ "step": 23900
1867
+ },
1868
+ {
1869
+ "epoch": 1.7502324479043225,
1870
+ "grad_norm": 0.867508590221405,
1871
+ "learning_rate": 8.433900974505365e-05,
1872
+ "loss": 0.7256,
1873
+ "step": 24000
1874
+ },
1875
+ {
1876
+ "epoch": 1.7502324479043225,
1877
+ "eval_loss": 0.7232645153999329,
1878
+ "eval_runtime": 60.377,
1879
+ "eval_samples_per_second": 148.318,
1880
+ "eval_steps_per_second": 18.55,
1881
+ "step": 24000
1882
+ },
1883
+ {
1884
+ "epoch": 1.7575249311771892,
1885
+ "grad_norm": 0.8258200287818909,
1886
+ "learning_rate": 8.384683531843685e-05,
1887
+ "loss": 0.7254,
1888
+ "step": 24100
1889
+ },
1890
+ {
1891
+ "epoch": 1.7648174144500555,
1892
+ "grad_norm": 0.9109018445014954,
1893
+ "learning_rate": 8.335466089182007e-05,
1894
+ "loss": 0.7315,
1895
+ "step": 24200
1896
+ },
1897
+ {
1898
+ "epoch": 1.7721098977229222,
1899
+ "grad_norm": 0.8500842452049255,
1900
+ "learning_rate": 8.286248646520326e-05,
1901
+ "loss": 0.7265,
1902
+ "step": 24300
1903
+ },
1904
+ {
1905
+ "epoch": 1.7794023809957886,
1906
+ "grad_norm": 0.9286713600158691,
1907
+ "learning_rate": 8.237031203858648e-05,
1908
+ "loss": 0.7247,
1909
+ "step": 24400
1910
+ },
1911
+ {
1912
+ "epoch": 1.786694864268655,
1913
+ "grad_norm": 0.8746926188468933,
1914
+ "learning_rate": 8.187813761196969e-05,
1915
+ "loss": 0.7261,
1916
+ "step": 24500
1917
+ },
1918
+ {
1919
+ "epoch": 1.7939873475415216,
1920
+ "grad_norm": 0.8702288866043091,
1921
+ "learning_rate": 8.13859631853529e-05,
1922
+ "loss": 0.7207,
1923
+ "step": 24600
1924
+ },
1925
+ {
1926
+ "epoch": 1.801279830814388,
1927
+ "grad_norm": 0.9746344089508057,
1928
+ "learning_rate": 8.08937887587361e-05,
1929
+ "loss": 0.728,
1930
+ "step": 24700
1931
+ },
1932
+ {
1933
+ "epoch": 1.8085723140872547,
1934
+ "grad_norm": 0.8815904259681702,
1935
+ "learning_rate": 8.04016143321193e-05,
1936
+ "loss": 0.7174,
1937
+ "step": 24800
1938
+ },
1939
+ {
1940
+ "epoch": 1.815864797360121,
1941
+ "grad_norm": 0.870474100112915,
1942
+ "learning_rate": 7.990943990550251e-05,
1943
+ "loss": 0.7316,
1944
+ "step": 24900
1945
+ },
1946
+ {
1947
+ "epoch": 1.8231572806329877,
1948
+ "grad_norm": 0.8451401591300964,
1949
+ "learning_rate": 7.941726547888572e-05,
1950
+ "loss": 0.7202,
1951
+ "step": 25000
1952
+ },
1953
+ {
1954
+ "epoch": 1.8231572806329877,
1955
+ "eval_loss": 0.721147358417511,
1956
+ "eval_runtime": 60.8906,
1957
+ "eval_samples_per_second": 147.067,
1958
+ "eval_steps_per_second": 18.394,
1959
+ "step": 25000
1960
+ },
1961
+ {
1962
+ "epoch": 1.830449763905854,
1963
+ "grad_norm": 0.8878180980682373,
1964
+ "learning_rate": 7.892509105226894e-05,
1965
+ "loss": 0.7236,
1966
+ "step": 25100
1967
+ },
1968
+ {
1969
+ "epoch": 1.8377422471787206,
1970
+ "grad_norm": 0.859920859336853,
1971
+ "learning_rate": 7.843291662565213e-05,
1972
+ "loss": 0.7257,
1973
+ "step": 25200
1974
+ },
1975
+ {
1976
+ "epoch": 1.845034730451587,
1977
+ "grad_norm": 0.9358228445053101,
1978
+ "learning_rate": 7.794074219903535e-05,
1979
+ "loss": 0.7175,
1980
+ "step": 25300
1981
+ },
1982
+ {
1983
+ "epoch": 1.8523272137244535,
1984
+ "grad_norm": 0.858906626701355,
1985
+ "learning_rate": 7.744856777241854e-05,
1986
+ "loss": 0.7217,
1987
+ "step": 25400
1988
+ },
1989
+ {
1990
+ "epoch": 1.85961969699732,
1991
+ "grad_norm": 0.9508287310600281,
1992
+ "learning_rate": 7.695639334580176e-05,
1993
+ "loss": 0.7211,
1994
+ "step": 25500
1995
+ },
1996
+ {
1997
+ "epoch": 1.8669121802701865,
1998
+ "grad_norm": 0.9340062141418457,
1999
+ "learning_rate": 7.646421891918496e-05,
2000
+ "loss": 0.7254,
2001
+ "step": 25600
2002
+ },
2003
+ {
2004
+ "epoch": 1.8742046635430532,
2005
+ "grad_norm": 0.9350687861442566,
2006
+ "learning_rate": 7.597204449256817e-05,
2007
+ "loss": 0.7247,
2008
+ "step": 25700
2009
+ },
2010
+ {
2011
+ "epoch": 1.8814971468159194,
2012
+ "grad_norm": 0.9614841938018799,
2013
+ "learning_rate": 7.547987006595137e-05,
2014
+ "loss": 0.7283,
2015
+ "step": 25800
2016
+ },
2017
+ {
2018
+ "epoch": 1.888789630088786,
2019
+ "grad_norm": 0.848640501499176,
2020
+ "learning_rate": 7.49876956393346e-05,
2021
+ "loss": 0.7221,
2022
+ "step": 25900
2023
+ },
2024
+ {
2025
+ "epoch": 1.8960821133616523,
2026
+ "grad_norm": 0.8105534315109253,
2027
+ "learning_rate": 7.449552121271779e-05,
2028
+ "loss": 0.7205,
2029
+ "step": 26000
2030
+ },
2031
+ {
2032
+ "epoch": 1.8960821133616523,
2033
+ "eval_loss": 0.7193262577056885,
2034
+ "eval_runtime": 61.1614,
2035
+ "eval_samples_per_second": 146.416,
2036
+ "eval_steps_per_second": 18.312,
2037
+ "step": 26000
2038
+ },
2039
+ {
2040
+ "epoch": 1.903374596634519,
2041
+ "grad_norm": 0.8522207736968994,
2042
+ "learning_rate": 7.4003346786101e-05,
2043
+ "loss": 0.7223,
2044
+ "step": 26100
2045
+ },
2046
+ {
2047
+ "epoch": 1.9106670799073855,
2048
+ "grad_norm": 0.8983740210533142,
2049
+ "learning_rate": 7.351117235948421e-05,
2050
+ "loss": 0.7208,
2051
+ "step": 26200
2052
+ },
2053
+ {
2054
+ "epoch": 1.917959563180252,
2055
+ "grad_norm": 0.8596473336219788,
2056
+ "learning_rate": 7.301899793286742e-05,
2057
+ "loss": 0.7184,
2058
+ "step": 26300
2059
+ },
2060
+ {
2061
+ "epoch": 1.9252520464531184,
2062
+ "grad_norm": 0.9175098538398743,
2063
+ "learning_rate": 7.252682350625062e-05,
2064
+ "loss": 0.7213,
2065
+ "step": 26400
2066
+ },
2067
+ {
2068
+ "epoch": 1.932544529725985,
2069
+ "grad_norm": 0.8626872897148132,
2070
+ "learning_rate": 7.203464907963383e-05,
2071
+ "loss": 0.7242,
2072
+ "step": 26500
2073
+ },
2074
+ {
2075
+ "epoch": 1.9398370129988516,
2076
+ "grad_norm": 0.859780490398407,
2077
+ "learning_rate": 7.154247465301703e-05,
2078
+ "loss": 0.7197,
2079
+ "step": 26600
2080
+ },
2081
+ {
2082
+ "epoch": 1.9471294962717178,
2083
+ "grad_norm": 0.8713703751564026,
2084
+ "learning_rate": 7.105030022640024e-05,
2085
+ "loss": 0.7231,
2086
+ "step": 26700
2087
+ },
2088
+ {
2089
+ "epoch": 1.9544219795445845,
2090
+ "grad_norm": 0.8976535797119141,
2091
+ "learning_rate": 7.055812579978344e-05,
2092
+ "loss": 0.7233,
2093
+ "step": 26800
2094
+ },
2095
+ {
2096
+ "epoch": 1.9617144628174508,
2097
+ "grad_norm": 0.9257802367210388,
2098
+ "learning_rate": 7.006595137316665e-05,
2099
+ "loss": 0.7221,
2100
+ "step": 26900
2101
+ },
2102
+ {
2103
+ "epoch": 1.9690069460903175,
2104
+ "grad_norm": 0.8592785596847534,
2105
+ "learning_rate": 6.957377694654987e-05,
2106
+ "loss": 0.7168,
2107
+ "step": 27000
2108
+ },
2109
+ {
2110
+ "epoch": 1.9690069460903175,
2111
+ "eval_loss": 0.7180259227752686,
2112
+ "eval_runtime": 60.5352,
2113
+ "eval_samples_per_second": 147.931,
2114
+ "eval_steps_per_second": 18.502,
2115
+ "step": 27000
2116
+ },
2117
+ {
2118
+ "epoch": 1.976299429363184,
2119
+ "grad_norm": 0.8931472897529602,
2120
+ "learning_rate": 6.908160251993306e-05,
2121
+ "loss": 0.7204,
2122
+ "step": 27100
2123
+ },
2124
+ {
2125
+ "epoch": 1.9835919126360504,
2126
+ "grad_norm": 0.8821597695350647,
2127
+ "learning_rate": 6.858942809331628e-05,
2128
+ "loss": 0.7163,
2129
+ "step": 27200
2130
+ },
2131
+ {
2132
+ "epoch": 1.9908843959089169,
2133
+ "grad_norm": 0.8749621510505676,
2134
+ "learning_rate": 6.809725366669948e-05,
2135
+ "loss": 0.711,
2136
+ "step": 27300
2137
+ },
2138
+ {
2139
+ "epoch": 1.9981768791817833,
2140
+ "grad_norm": 0.903332531452179,
2141
+ "learning_rate": 6.760507924008269e-05,
2142
+ "loss": 0.7176,
2143
+ "step": 27400
2144
+ },
2145
+ {
2146
+ "epoch": 2.005505824871014,
2147
+ "grad_norm": 0.854773759841919,
2148
+ "learning_rate": 6.71129048134659e-05,
2149
+ "loss": 0.7187,
2150
+ "step": 27500
2151
+ },
2152
+ {
2153
+ "epoch": 2.0127983081438807,
2154
+ "grad_norm": 0.9489893913269043,
2155
+ "learning_rate": 6.66207303868491e-05,
2156
+ "loss": 0.7096,
2157
+ "step": 27600
2158
+ },
2159
+ {
2160
+ "epoch": 2.020090791416747,
2161
+ "grad_norm": 0.8944621682167053,
2162
+ "learning_rate": 6.61285559602323e-05,
2163
+ "loss": 0.7104,
2164
+ "step": 27700
2165
+ },
2166
+ {
2167
+ "epoch": 2.0273832746896137,
2168
+ "grad_norm": 0.8567011952400208,
2169
+ "learning_rate": 6.563638153361553e-05,
2170
+ "loss": 0.7124,
2171
+ "step": 27800
2172
+ },
2173
+ {
2174
+ "epoch": 2.0346757579624803,
2175
+ "grad_norm": 0.8737155199050903,
2176
+ "learning_rate": 6.514420710699872e-05,
2177
+ "loss": 0.7127,
2178
+ "step": 27900
2179
+ },
2180
+ {
2181
+ "epoch": 2.0419682412353466,
2182
+ "grad_norm": 0.8935887813568115,
2183
+ "learning_rate": 6.465203268038194e-05,
2184
+ "loss": 0.7122,
2185
+ "step": 28000
2186
+ },
2187
+ {
2188
+ "epoch": 2.0419682412353466,
2189
+ "eval_loss": 0.716705858707428,
2190
+ "eval_runtime": 60.7739,
2191
+ "eval_samples_per_second": 147.349,
2192
+ "eval_steps_per_second": 18.429,
2193
+ "step": 28000
2194
+ },
2195
+ {
2196
+ "epoch": 2.0492607245082133,
2197
+ "grad_norm": 0.9452987313270569,
2198
+ "learning_rate": 6.415985825376514e-05,
2199
+ "loss": 0.7112,
2200
+ "step": 28100
2201
+ },
2202
+ {
2203
+ "epoch": 2.0565532077810795,
2204
+ "grad_norm": 0.8650675415992737,
2205
+ "learning_rate": 6.366768382714833e-05,
2206
+ "loss": 0.7079,
2207
+ "step": 28200
2208
+ },
2209
+ {
2210
+ "epoch": 2.063845691053946,
2211
+ "grad_norm": 0.8913034796714783,
2212
+ "learning_rate": 6.317550940053155e-05,
2213
+ "loss": 0.713,
2214
+ "step": 28300
2215
+ },
2216
+ {
2217
+ "epoch": 2.0711381743268125,
2218
+ "grad_norm": 0.9072710275650024,
2219
+ "learning_rate": 6.268333497391476e-05,
2220
+ "loss": 0.7094,
2221
+ "step": 28400
2222
+ },
2223
+ {
2224
+ "epoch": 2.078430657599679,
2225
+ "grad_norm": 0.854245126247406,
2226
+ "learning_rate": 6.219116054729796e-05,
2227
+ "loss": 0.7077,
2228
+ "step": 28500
2229
+ },
2230
+ {
2231
+ "epoch": 2.0857231408725454,
2232
+ "grad_norm": 0.929263174533844,
2233
+ "learning_rate": 6.169898612068117e-05,
2234
+ "loss": 0.7086,
2235
+ "step": 28600
2236
+ },
2237
+ {
2238
+ "epoch": 2.093015624145412,
2239
+ "grad_norm": 0.9356215596199036,
2240
+ "learning_rate": 6.120681169406438e-05,
2241
+ "loss": 0.7157,
2242
+ "step": 28700
2243
+ },
2244
+ {
2245
+ "epoch": 2.100308107418279,
2246
+ "grad_norm": 0.9242870211601257,
2247
+ "learning_rate": 6.071463726744758e-05,
2248
+ "loss": 0.71,
2249
+ "step": 28800
2250
+ },
2251
+ {
2252
+ "epoch": 2.107600590691145,
2253
+ "grad_norm": 0.9065095782279968,
2254
+ "learning_rate": 6.022246284083079e-05,
2255
+ "loss": 0.7095,
2256
+ "step": 28900
2257
+ },
2258
+ {
2259
+ "epoch": 2.1148930739640117,
2260
+ "grad_norm": 0.9081276059150696,
2261
+ "learning_rate": 5.9730288414214e-05,
2262
+ "loss": 0.7096,
2263
+ "step": 29000
2264
+ },
2265
+ {
2266
+ "epoch": 2.1148930739640117,
2267
+ "eval_loss": 0.7152244448661804,
2268
+ "eval_runtime": 60.7986,
2269
+ "eval_samples_per_second": 147.29,
2270
+ "eval_steps_per_second": 18.421,
2271
+ "step": 29000
2272
+ },
2273
+ {
2274
+ "epoch": 2.122185557236878,
2275
+ "grad_norm": 0.8326215744018555,
2276
+ "learning_rate": 5.923811398759721e-05,
2277
+ "loss": 0.7147,
2278
+ "step": 29100
2279
+ },
2280
+ {
2281
+ "epoch": 2.1294780405097447,
2282
+ "grad_norm": 0.9274723529815674,
2283
+ "learning_rate": 5.874593956098041e-05,
2284
+ "loss": 0.7111,
2285
+ "step": 29200
2286
+ },
2287
+ {
2288
+ "epoch": 2.136770523782611,
2289
+ "grad_norm": 0.8282331824302673,
2290
+ "learning_rate": 5.825376513436362e-05,
2291
+ "loss": 0.7137,
2292
+ "step": 29300
2293
+ },
2294
+ {
2295
+ "epoch": 2.1440630070554776,
2296
+ "grad_norm": 0.9081612229347229,
2297
+ "learning_rate": 5.776159070774683e-05,
2298
+ "loss": 0.7115,
2299
+ "step": 29400
2300
+ },
2301
+ {
2302
+ "epoch": 2.151355490328344,
2303
+ "grad_norm": 0.9531508684158325,
2304
+ "learning_rate": 5.726941628113004e-05,
2305
+ "loss": 0.708,
2306
+ "step": 29500
2307
+ },
2308
+ {
2309
+ "epoch": 2.1586479736012105,
2310
+ "grad_norm": 0.9125275611877441,
2311
+ "learning_rate": 5.677724185451324e-05,
2312
+ "loss": 0.7123,
2313
+ "step": 29600
2314
+ },
2315
+ {
2316
+ "epoch": 2.165940456874077,
2317
+ "grad_norm": 0.9363859295845032,
2318
+ "learning_rate": 5.628506742789645e-05,
2319
+ "loss": 0.7146,
2320
+ "step": 29700
2321
+ },
2322
+ {
2323
+ "epoch": 2.1732329401469435,
2324
+ "grad_norm": 0.9164854884147644,
2325
+ "learning_rate": 5.579289300127966e-05,
2326
+ "loss": 0.7121,
2327
+ "step": 29800
2328
+ },
2329
+ {
2330
+ "epoch": 2.18052542341981,
2331
+ "grad_norm": 0.941330075263977,
2332
+ "learning_rate": 5.530071857466287e-05,
2333
+ "loss": 0.7086,
2334
+ "step": 29900
2335
+ },
2336
+ {
2337
+ "epoch": 2.1878179066926764,
2338
+ "grad_norm": 0.9006567597389221,
2339
+ "learning_rate": 5.480854414804607e-05,
2340
+ "loss": 0.7097,
2341
+ "step": 30000
2342
+ },
2343
+ {
2344
+ "epoch": 2.1878179066926764,
2345
+ "eval_loss": 0.7143043875694275,
2346
+ "eval_runtime": 61.0555,
2347
+ "eval_samples_per_second": 146.67,
2348
+ "eval_steps_per_second": 18.344,
2349
+ "step": 30000
2350
+ },
2351
+ {
2352
+ "epoch": 2.195110389965543,
2353
+ "grad_norm": 0.8913944363594055,
2354
+ "learning_rate": 5.431636972142927e-05,
2355
+ "loss": 0.7066,
2356
+ "step": 30100
2357
+ },
2358
+ {
2359
+ "epoch": 2.2024028732384093,
2360
+ "grad_norm": 0.9200546145439148,
2361
+ "learning_rate": 5.3824195294812486e-05,
2362
+ "loss": 0.7076,
2363
+ "step": 30200
2364
+ },
2365
+ {
2366
+ "epoch": 2.209695356511276,
2367
+ "grad_norm": 0.924148440361023,
2368
+ "learning_rate": 5.3332020868195684e-05,
2369
+ "loss": 0.7058,
2370
+ "step": 30300
2371
+ },
2372
+ {
2373
+ "epoch": 2.2169878397841423,
2374
+ "grad_norm": 0.922255277633667,
2375
+ "learning_rate": 5.2839846441578897e-05,
2376
+ "loss": 0.7108,
2377
+ "step": 30400
2378
+ },
2379
+ {
2380
+ "epoch": 2.224280323057009,
2381
+ "grad_norm": 0.9039818644523621,
2382
+ "learning_rate": 5.23476720149621e-05,
2383
+ "loss": 0.7091,
2384
+ "step": 30500
2385
+ },
2386
+ {
2387
+ "epoch": 2.2315728063298756,
2388
+ "grad_norm": 0.963845431804657,
2389
+ "learning_rate": 5.1855497588345314e-05,
2390
+ "loss": 0.7065,
2391
+ "step": 30600
2392
+ },
2393
+ {
2394
+ "epoch": 2.238865289602742,
2395
+ "grad_norm": 0.8838880658149719,
2396
+ "learning_rate": 5.136332316172851e-05,
2397
+ "loss": 0.7113,
2398
+ "step": 30700
2399
+ },
2400
+ {
2401
+ "epoch": 2.2461577728756086,
2402
+ "grad_norm": 0.9642555117607117,
2403
+ "learning_rate": 5.0871148735111725e-05,
2404
+ "loss": 0.7062,
2405
+ "step": 30800
2406
+ },
2407
+ {
2408
+ "epoch": 2.253450256148475,
2409
+ "grad_norm": 0.9088276624679565,
2410
+ "learning_rate": 5.037897430849493e-05,
2411
+ "loss": 0.7071,
2412
+ "step": 30900
2413
+ },
2414
+ {
2415
+ "epoch": 2.2607427394213415,
2416
+ "grad_norm": 0.9083282351493835,
2417
+ "learning_rate": 4.9886799881878137e-05,
2418
+ "loss": 0.7126,
2419
+ "step": 31000
2420
+ },
2421
+ {
2422
+ "epoch": 2.2607427394213415,
2423
+ "eval_loss": 0.7129958868026733,
2424
+ "eval_runtime": 60.7821,
2425
+ "eval_samples_per_second": 147.33,
2426
+ "eval_steps_per_second": 18.426,
2427
+ "step": 31000
2428
+ },
2429
+ {
2430
+ "epoch": 2.2680352226942078,
2431
+ "grad_norm": 0.886710524559021,
2432
+ "learning_rate": 4.939462545526134e-05,
2433
+ "loss": 0.7043,
2434
+ "step": 31100
2435
+ },
2436
+ {
2437
+ "epoch": 2.2753277059670745,
2438
+ "grad_norm": 0.8600069880485535,
2439
+ "learning_rate": 4.8902451028644554e-05,
2440
+ "loss": 0.7074,
2441
+ "step": 31200
2442
+ },
2443
+ {
2444
+ "epoch": 2.2826201892399407,
2445
+ "grad_norm": 0.8897703289985657,
2446
+ "learning_rate": 4.841027660202776e-05,
2447
+ "loss": 0.7068,
2448
+ "step": 31300
2449
+ },
2450
+ {
2451
+ "epoch": 2.2899126725128074,
2452
+ "grad_norm": 0.8638718724250793,
2453
+ "learning_rate": 4.7918102175410965e-05,
2454
+ "loss": 0.7062,
2455
+ "step": 31400
2456
+ },
2457
+ {
2458
+ "epoch": 2.297205155785674,
2459
+ "grad_norm": 0.8973529934883118,
2460
+ "learning_rate": 4.742592774879418e-05,
2461
+ "loss": 0.7073,
2462
+ "step": 31500
2463
+ },
2464
+ {
2465
+ "epoch": 2.3044976390585403,
2466
+ "grad_norm": 0.9759765267372131,
2467
+ "learning_rate": 4.693375332217738e-05,
2468
+ "loss": 0.7087,
2469
+ "step": 31600
2470
+ },
2471
+ {
2472
+ "epoch": 2.311790122331407,
2473
+ "grad_norm": 0.9061428904533386,
2474
+ "learning_rate": 4.644157889556059e-05,
2475
+ "loss": 0.708,
2476
+ "step": 31700
2477
+ },
2478
+ {
2479
+ "epoch": 2.3190826056042733,
2480
+ "grad_norm": 0.8808257579803467,
2481
+ "learning_rate": 4.5949404468943794e-05,
2482
+ "loss": 0.7086,
2483
+ "step": 31800
2484
+ },
2485
+ {
2486
+ "epoch": 2.32637508887714,
2487
+ "grad_norm": 0.9116071462631226,
2488
+ "learning_rate": 4.545723004232701e-05,
2489
+ "loss": 0.7118,
2490
+ "step": 31900
2491
+ },
2492
+ {
2493
+ "epoch": 2.333667572150006,
2494
+ "grad_norm": 0.9131873846054077,
2495
+ "learning_rate": 4.496505561571021e-05,
2496
+ "loss": 0.7043,
2497
+ "step": 32000
2498
+ },
2499
+ {
2500
+ "epoch": 2.333667572150006,
2501
+ "eval_loss": 0.7112506031990051,
2502
+ "eval_runtime": 61.1535,
2503
+ "eval_samples_per_second": 146.435,
2504
+ "eval_steps_per_second": 18.315,
2505
+ "step": 32000
2506
+ },
2507
+ {
2508
+ "epoch": 2.340960055422873,
2509
+ "grad_norm": 0.9860331416130066,
2510
+ "learning_rate": 4.447288118909342e-05,
2511
+ "loss": 0.7063,
2512
+ "step": 32100
2513
+ },
2514
+ {
2515
+ "epoch": 2.348252538695739,
2516
+ "grad_norm": 0.933958888053894,
2517
+ "learning_rate": 4.398070676247662e-05,
2518
+ "loss": 0.708,
2519
+ "step": 32200
2520
+ },
2521
+ {
2522
+ "epoch": 2.355545021968606,
2523
+ "grad_norm": 0.8994225859642029,
2524
+ "learning_rate": 4.3488532335859836e-05,
2525
+ "loss": 0.7089,
2526
+ "step": 32300
2527
+ },
2528
+ {
2529
+ "epoch": 2.3628375052414725,
2530
+ "grad_norm": 0.9435915946960449,
2531
+ "learning_rate": 4.299635790924304e-05,
2532
+ "loss": 0.7057,
2533
+ "step": 32400
2534
+ },
2535
+ {
2536
+ "epoch": 2.3701299885143388,
2537
+ "grad_norm": 0.888438880443573,
2538
+ "learning_rate": 4.2504183482626247e-05,
2539
+ "loss": 0.7012,
2540
+ "step": 32500
2541
+ },
2542
+ {
2543
+ "epoch": 2.3774224717872054,
2544
+ "grad_norm": 0.8772885799407959,
2545
+ "learning_rate": 4.201200905600945e-05,
2546
+ "loss": 0.7071,
2547
+ "step": 32600
2548
+ },
2549
+ {
2550
+ "epoch": 2.3847149550600717,
2551
+ "grad_norm": 0.9333481788635254,
2552
+ "learning_rate": 4.151983462939266e-05,
2553
+ "loss": 0.7095,
2554
+ "step": 32700
2555
+ },
2556
+ {
2557
+ "epoch": 2.3920074383329384,
2558
+ "grad_norm": 0.9497707486152649,
2559
+ "learning_rate": 4.102766020277586e-05,
2560
+ "loss": 0.7115,
2561
+ "step": 32800
2562
+ },
2563
+ {
2564
+ "epoch": 2.3992999216058046,
2565
+ "grad_norm": 0.9641472697257996,
2566
+ "learning_rate": 4.053548577615907e-05,
2567
+ "loss": 0.712,
2568
+ "step": 32900
2569
+ },
2570
+ {
2571
+ "epoch": 2.4065924048786713,
2572
+ "grad_norm": 0.8958153128623962,
2573
+ "learning_rate": 4.004331134954228e-05,
2574
+ "loss": 0.7035,
2575
+ "step": 33000
2576
+ },
2577
+ {
2578
+ "epoch": 2.4065924048786713,
2579
+ "eval_loss": 0.7100856304168701,
2580
+ "eval_runtime": 61.2325,
2581
+ "eval_samples_per_second": 146.246,
2582
+ "eval_steps_per_second": 18.291,
2583
+ "step": 33000
2584
+ },
2585
+ {
2586
+ "epoch": 2.4138848881515376,
2587
+ "grad_norm": 0.8818393349647522,
2588
+ "learning_rate": 3.9551136922925487e-05,
2589
+ "loss": 0.7052,
2590
+ "step": 33100
2591
+ },
2592
+ {
2593
+ "epoch": 2.4211773714244043,
2594
+ "grad_norm": 0.8973012566566467,
2595
+ "learning_rate": 3.905896249630869e-05,
2596
+ "loss": 0.706,
2597
+ "step": 33200
2598
+ },
2599
+ {
2600
+ "epoch": 2.428469854697271,
2601
+ "grad_norm": 0.8582873344421387,
2602
+ "learning_rate": 3.85667880696919e-05,
2603
+ "loss": 0.7088,
2604
+ "step": 33300
2605
+ },
2606
+ {
2607
+ "epoch": 2.435762337970137,
2608
+ "grad_norm": 0.9306252002716064,
2609
+ "learning_rate": 3.807461364307511e-05,
2610
+ "loss": 0.7062,
2611
+ "step": 33400
2612
+ },
2613
+ {
2614
+ "epoch": 2.443054821243004,
2615
+ "grad_norm": 0.8586992025375366,
2616
+ "learning_rate": 3.7582439216458315e-05,
2617
+ "loss": 0.7086,
2618
+ "step": 33500
2619
+ },
2620
+ {
2621
+ "epoch": 2.45034730451587,
2622
+ "grad_norm": 0.9076369404792786,
2623
+ "learning_rate": 3.709026478984152e-05,
2624
+ "loss": 0.7052,
2625
+ "step": 33600
2626
+ },
2627
+ {
2628
+ "epoch": 2.457639787788737,
2629
+ "grad_norm": 0.8954334855079651,
2630
+ "learning_rate": 3.6598090363224727e-05,
2631
+ "loss": 0.7082,
2632
+ "step": 33700
2633
+ },
2634
+ {
2635
+ "epoch": 2.464932271061603,
2636
+ "grad_norm": 0.9315345287322998,
2637
+ "learning_rate": 3.610591593660794e-05,
2638
+ "loss": 0.7058,
2639
+ "step": 33800
2640
+ },
2641
+ {
2642
+ "epoch": 2.4722247543344698,
2643
+ "grad_norm": 0.9223620295524597,
2644
+ "learning_rate": 3.5613741509991144e-05,
2645
+ "loss": 0.6992,
2646
+ "step": 33900
2647
+ },
2648
+ {
2649
+ "epoch": 2.479517237607336,
2650
+ "grad_norm": 0.9349290132522583,
2651
+ "learning_rate": 3.512156708337435e-05,
2652
+ "loss": 0.7084,
2653
+ "step": 34000
2654
+ },
2655
+ {
2656
+ "epoch": 2.479517237607336,
2657
+ "eval_loss": 0.7087690234184265,
2658
+ "eval_runtime": 60.8859,
2659
+ "eval_samples_per_second": 147.078,
2660
+ "eval_steps_per_second": 18.395,
2661
+ "step": 34000
2662
+ },
2663
+ {
2664
+ "epoch": 2.4868097208802027,
2665
+ "grad_norm": 0.883210301399231,
2666
+ "learning_rate": 3.462939265675756e-05,
2667
+ "loss": 0.7061,
2668
+ "step": 34100
2669
+ },
2670
+ {
2671
+ "epoch": 2.4941022041530694,
2672
+ "grad_norm": 0.920868456363678,
2673
+ "learning_rate": 3.413721823014077e-05,
2674
+ "loss": 0.7069,
2675
+ "step": 34200
2676
+ },
2677
+ {
2678
+ "epoch": 2.5013946874259356,
2679
+ "grad_norm": 0.9177393913269043,
2680
+ "learning_rate": 3.3645043803523966e-05,
2681
+ "loss": 0.7071,
2682
+ "step": 34300
2683
+ },
2684
+ {
2685
+ "epoch": 2.5086871706988023,
2686
+ "grad_norm": 0.9114101529121399,
2687
+ "learning_rate": 3.315286937690717e-05,
2688
+ "loss": 0.7072,
2689
+ "step": 34400
2690
+ },
2691
+ {
2692
+ "epoch": 2.5159796539716686,
2693
+ "grad_norm": 0.9645174145698547,
2694
+ "learning_rate": 3.2660694950290384e-05,
2695
+ "loss": 0.7028,
2696
+ "step": 34500
2697
+ },
2698
+ {
2699
+ "epoch": 2.5232721372445353,
2700
+ "grad_norm": 0.8982295989990234,
2701
+ "learning_rate": 3.216852052367359e-05,
2702
+ "loss": 0.7085,
2703
+ "step": 34600
2704
+ },
2705
+ {
2706
+ "epoch": 2.530564620517402,
2707
+ "grad_norm": 0.8964338898658752,
2708
+ "learning_rate": 3.1676346097056795e-05,
2709
+ "loss": 0.7069,
2710
+ "step": 34700
2711
+ },
2712
+ {
2713
+ "epoch": 2.537857103790268,
2714
+ "grad_norm": 0.9609666466712952,
2715
+ "learning_rate": 3.118417167044001e-05,
2716
+ "loss": 0.7057,
2717
+ "step": 34800
2718
+ },
2719
+ {
2720
+ "epoch": 2.5451495870631344,
2721
+ "grad_norm": 0.9131038188934326,
2722
+ "learning_rate": 3.069199724382321e-05,
2723
+ "loss": 0.7031,
2724
+ "step": 34900
2725
+ },
2726
+ {
2727
+ "epoch": 2.552442070336001,
2728
+ "grad_norm": 0.9127321839332581,
2729
+ "learning_rate": 3.019982281720642e-05,
2730
+ "loss": 0.6979,
2731
+ "step": 35000
2732
+ },
2733
+ {
2734
+ "epoch": 2.552442070336001,
2735
+ "eval_loss": 0.7076790928840637,
2736
+ "eval_runtime": 61.0966,
2737
+ "eval_samples_per_second": 146.571,
2738
+ "eval_steps_per_second": 18.332,
2739
+ "step": 35000
2740
+ },
2741
+ {
2742
+ "epoch": 2.559734553608868,
2743
+ "grad_norm": 0.9567495584487915,
2744
+ "learning_rate": 2.9707648390589628e-05,
2745
+ "loss": 0.7053,
2746
+ "step": 35100
2747
+ },
2748
+ {
2749
+ "epoch": 2.567027036881734,
2750
+ "grad_norm": 0.9740573763847351,
2751
+ "learning_rate": 2.9215473963972833e-05,
2752
+ "loss": 0.7077,
2753
+ "step": 35200
2754
+ },
2755
+ {
2756
+ "epoch": 2.5743195201546007,
2757
+ "grad_norm": 0.8982974886894226,
2758
+ "learning_rate": 2.8723299537356042e-05,
2759
+ "loss": 0.6983,
2760
+ "step": 35300
2761
+ },
2762
+ {
2763
+ "epoch": 2.581612003427467,
2764
+ "grad_norm": 1.0185188055038452,
2765
+ "learning_rate": 2.8231125110739248e-05,
2766
+ "loss": 0.7069,
2767
+ "step": 35400
2768
+ },
2769
+ {
2770
+ "epoch": 2.5889044867003337,
2771
+ "grad_norm": 0.94049471616745,
2772
+ "learning_rate": 2.7738950684122457e-05,
2773
+ "loss": 0.7054,
2774
+ "step": 35500
2775
+ },
2776
+ {
2777
+ "epoch": 2.5961969699732004,
2778
+ "grad_norm": 0.8923749923706055,
2779
+ "learning_rate": 2.7246776257505662e-05,
2780
+ "loss": 0.7015,
2781
+ "step": 35600
2782
+ },
2783
+ {
2784
+ "epoch": 2.6034894532460666,
2785
+ "grad_norm": 0.9568887948989868,
2786
+ "learning_rate": 2.675460183088887e-05,
2787
+ "loss": 0.7025,
2788
+ "step": 35700
2789
+ },
2790
+ {
2791
+ "epoch": 2.610781936518933,
2792
+ "grad_norm": 0.9106321334838867,
2793
+ "learning_rate": 2.6262427404272077e-05,
2794
+ "loss": 0.7049,
2795
+ "step": 35800
2796
+ },
2797
+ {
2798
+ "epoch": 2.6180744197917996,
2799
+ "grad_norm": 0.9499268531799316,
2800
+ "learning_rate": 2.5770252977655285e-05,
2801
+ "loss": 0.7021,
2802
+ "step": 35900
2803
+ },
2804
+ {
2805
+ "epoch": 2.6253669030646662,
2806
+ "grad_norm": 0.8965421915054321,
2807
+ "learning_rate": 2.5278078551038488e-05,
2808
+ "loss": 0.7036,
2809
+ "step": 36000
2810
+ },
2811
+ {
2812
+ "epoch": 2.6253669030646662,
2813
+ "eval_loss": 0.7065343856811523,
2814
+ "eval_runtime": 61.0446,
2815
+ "eval_samples_per_second": 146.696,
2816
+ "eval_steps_per_second": 18.347,
2817
+ "step": 36000
2818
+ },
2819
+ {
2820
+ "epoch": 2.6326593863375325,
2821
+ "grad_norm": 0.94576096534729,
2822
+ "learning_rate": 2.4785904124421696e-05,
2823
+ "loss": 0.71,
2824
+ "step": 36100
2825
+ },
2826
+ {
2827
+ "epoch": 2.639951869610399,
2828
+ "grad_norm": 0.962692141532898,
2829
+ "learning_rate": 2.4293729697804905e-05,
2830
+ "loss": 0.6953,
2831
+ "step": 36200
2832
+ },
2833
+ {
2834
+ "epoch": 2.6472443528832654,
2835
+ "grad_norm": 0.9457094669342041,
2836
+ "learning_rate": 2.380155527118811e-05,
2837
+ "loss": 0.7011,
2838
+ "step": 36300
2839
+ },
2840
+ {
2841
+ "epoch": 2.654536836156132,
2842
+ "grad_norm": 0.9523045420646667,
2843
+ "learning_rate": 2.330938084457132e-05,
2844
+ "loss": 0.7093,
2845
+ "step": 36400
2846
+ },
2847
+ {
2848
+ "epoch": 2.661829319428999,
2849
+ "grad_norm": 0.9255204796791077,
2850
+ "learning_rate": 2.2817206417954522e-05,
2851
+ "loss": 0.6979,
2852
+ "step": 36500
2853
+ },
2854
+ {
2855
+ "epoch": 2.669121802701865,
2856
+ "grad_norm": 1.015286922454834,
2857
+ "learning_rate": 2.232503199133773e-05,
2858
+ "loss": 0.7044,
2859
+ "step": 36600
2860
+ },
2861
+ {
2862
+ "epoch": 2.6764142859747313,
2863
+ "grad_norm": 0.8911315202713013,
2864
+ "learning_rate": 2.1832857564720936e-05,
2865
+ "loss": 0.7031,
2866
+ "step": 36700
2867
+ },
2868
+ {
2869
+ "epoch": 2.683706769247598,
2870
+ "grad_norm": 0.9372689127922058,
2871
+ "learning_rate": 2.1340683138104145e-05,
2872
+ "loss": 0.7019,
2873
+ "step": 36800
2874
+ },
2875
+ {
2876
+ "epoch": 2.6909992525204647,
2877
+ "grad_norm": 0.9245051145553589,
2878
+ "learning_rate": 2.084850871148735e-05,
2879
+ "loss": 0.7065,
2880
+ "step": 36900
2881
+ },
2882
+ {
2883
+ "epoch": 2.698291735793331,
2884
+ "grad_norm": 0.917607843875885,
2885
+ "learning_rate": 2.035633428487056e-05,
2886
+ "loss": 0.7016,
2887
+ "step": 37000
2888
+ },
2889
+ {
2890
+ "epoch": 2.698291735793331,
2891
+ "eval_loss": 0.7054994702339172,
2892
+ "eval_runtime": 60.6541,
2893
+ "eval_samples_per_second": 147.64,
2894
+ "eval_steps_per_second": 18.465,
2895
+ "step": 37000
2896
+ },
2897
+ {
2898
+ "epoch": 2.7055842190661976,
2899
+ "grad_norm": 0.9054610729217529,
2900
+ "learning_rate": 1.9864159858253765e-05,
2901
+ "loss": 0.7034,
2902
+ "step": 37100
2903
+ },
2904
+ {
2905
+ "epoch": 2.712876702339064,
2906
+ "grad_norm": 0.960075855255127,
2907
+ "learning_rate": 1.9371985431636974e-05,
2908
+ "loss": 0.7097,
2909
+ "step": 37200
2910
+ },
2911
+ {
2912
+ "epoch": 2.7201691856119306,
2913
+ "grad_norm": 0.9454420208930969,
2914
+ "learning_rate": 1.887981100502018e-05,
2915
+ "loss": 0.7046,
2916
+ "step": 37300
2917
+ },
2918
+ {
2919
+ "epoch": 2.7274616688847972,
2920
+ "grad_norm": 0.8761453628540039,
2921
+ "learning_rate": 1.8387636578403385e-05,
2922
+ "loss": 0.7068,
2923
+ "step": 37400
2924
+ },
2925
+ {
2926
+ "epoch": 2.7347541521576635,
2927
+ "grad_norm": 0.9231957793235779,
2928
+ "learning_rate": 1.7895462151786594e-05,
2929
+ "loss": 0.6983,
2930
+ "step": 37500
2931
+ },
2932
+ {
2933
+ "epoch": 2.7420466354305297,
2934
+ "grad_norm": 0.8630309104919434,
2935
+ "learning_rate": 1.74032877251698e-05,
2936
+ "loss": 0.6984,
2937
+ "step": 37600
2938
+ },
2939
+ {
2940
+ "epoch": 2.7493391187033964,
2941
+ "grad_norm": 0.9077728986740112,
2942
+ "learning_rate": 1.691111329855301e-05,
2943
+ "loss": 0.7097,
2944
+ "step": 37700
2945
+ },
2946
+ {
2947
+ "epoch": 2.756631601976263,
2948
+ "grad_norm": 0.9849316477775574,
2949
+ "learning_rate": 1.6418938871936214e-05,
2950
+ "loss": 0.7025,
2951
+ "step": 37800
2952
+ },
2953
+ {
2954
+ "epoch": 2.7639240852491294,
2955
+ "grad_norm": 0.9101927280426025,
2956
+ "learning_rate": 1.5926764445319423e-05,
2957
+ "loss": 0.7127,
2958
+ "step": 37900
2959
+ },
2960
+ {
2961
+ "epoch": 2.771216568521996,
2962
+ "grad_norm": 0.9624613523483276,
2963
+ "learning_rate": 1.543459001870263e-05,
2964
+ "loss": 0.7038,
2965
+ "step": 38000
2966
+ },
2967
+ {
2968
+ "epoch": 2.771216568521996,
2969
+ "eval_loss": 0.7042670845985413,
2970
+ "eval_runtime": 60.6288,
2971
+ "eval_samples_per_second": 147.702,
2972
+ "eval_steps_per_second": 18.473,
2973
+ "step": 38000
2974
+ },
2975
+ {
2976
+ "epoch": 2.7785090517948623,
2977
+ "grad_norm": 0.8926946520805359,
2978
+ "learning_rate": 1.4942415592085838e-05,
2979
+ "loss": 0.6955,
2980
+ "step": 38100
2981
+ },
2982
+ {
2983
+ "epoch": 2.785801535067729,
2984
+ "grad_norm": 0.9353916645050049,
2985
+ "learning_rate": 1.4450241165469041e-05,
2986
+ "loss": 0.7003,
2987
+ "step": 38200
2988
+ },
2989
+ {
2990
+ "epoch": 2.7930940183405957,
2991
+ "grad_norm": 0.9394625425338745,
2992
+ "learning_rate": 1.3958066738852249e-05,
2993
+ "loss": 0.6963,
2994
+ "step": 38300
2995
+ },
2996
+ {
2997
+ "epoch": 2.800386501613462,
2998
+ "grad_norm": 0.8811284303665161,
2999
+ "learning_rate": 1.3465892312235456e-05,
3000
+ "loss": 0.7057,
3001
+ "step": 38400
3002
+ },
3003
+ {
3004
+ "epoch": 2.807678984886328,
3005
+ "grad_norm": 0.9111167788505554,
3006
+ "learning_rate": 1.2973717885618663e-05,
3007
+ "loss": 0.6905,
3008
+ "step": 38500
3009
+ },
3010
+ {
3011
+ "epoch": 2.814971468159195,
3012
+ "grad_norm": 0.9061198830604553,
3013
+ "learning_rate": 1.248154345900187e-05,
3014
+ "loss": 0.6966,
3015
+ "step": 38600
3016
+ },
3017
+ {
3018
+ "epoch": 2.8222639514320615,
3019
+ "grad_norm": 0.917921781539917,
3020
+ "learning_rate": 1.1989369032385078e-05,
3021
+ "loss": 0.7055,
3022
+ "step": 38700
3023
+ },
3024
+ {
3025
+ "epoch": 2.829556434704928,
3026
+ "grad_norm": 0.9210913777351379,
3027
+ "learning_rate": 1.1497194605768285e-05,
3028
+ "loss": 0.7004,
3029
+ "step": 38800
3030
+ },
3031
+ {
3032
+ "epoch": 2.8368489179777945,
3033
+ "grad_norm": 0.9152899384498596,
3034
+ "learning_rate": 1.1005020179151492e-05,
3035
+ "loss": 0.7065,
3036
+ "step": 38900
3037
+ },
3038
+ {
3039
+ "epoch": 2.8441414012506607,
3040
+ "grad_norm": 0.9237668514251709,
3041
+ "learning_rate": 1.05128457525347e-05,
3042
+ "loss": 0.7027,
3043
+ "step": 39000
3044
+ },
3045
+ {
3046
+ "epoch": 2.8441414012506607,
3047
+ "eval_loss": 0.7034493088722229,
3048
+ "eval_runtime": 60.6775,
3049
+ "eval_samples_per_second": 147.583,
3050
+ "eval_steps_per_second": 18.458,
3051
+ "step": 39000
3052
+ },
3053
+ {
3054
+ "epoch": 2.8514338845235274,
3055
+ "grad_norm": 0.9577778577804565,
3056
+ "learning_rate": 1.0020671325917906e-05,
3057
+ "loss": 0.7064,
3058
+ "step": 39100
3059
+ },
3060
+ {
3061
+ "epoch": 2.858726367796394,
3062
+ "grad_norm": 0.9955913424491882,
3063
+ "learning_rate": 9.528496899301114e-06,
3064
+ "loss": 0.7017,
3065
+ "step": 39200
3066
+ },
3067
+ {
3068
+ "epoch": 2.8660188510692604,
3069
+ "grad_norm": 0.9187660217285156,
3070
+ "learning_rate": 9.03632247268432e-06,
3071
+ "loss": 0.6998,
3072
+ "step": 39300
3073
+ },
3074
+ {
3075
+ "epoch": 2.8733113343421266,
3076
+ "grad_norm": 0.9275550842285156,
3077
+ "learning_rate": 8.544148046067526e-06,
3078
+ "loss": 0.7002,
3079
+ "step": 39400
3080
+ },
3081
+ {
3082
+ "epoch": 2.8806038176149933,
3083
+ "grad_norm": 0.9114721417427063,
3084
+ "learning_rate": 8.051973619450734e-06,
3085
+ "loss": 0.7027,
3086
+ "step": 39500
3087
+ },
3088
+ {
3089
+ "epoch": 2.88789630088786,
3090
+ "grad_norm": 0.9408327341079712,
3091
+ "learning_rate": 7.559799192833941e-06,
3092
+ "loss": 0.7034,
3093
+ "step": 39600
3094
+ },
3095
+ {
3096
+ "epoch": 2.8951887841607262,
3097
+ "grad_norm": 0.9538366198539734,
3098
+ "learning_rate": 7.067624766217147e-06,
3099
+ "loss": 0.7007,
3100
+ "step": 39700
3101
+ },
3102
+ {
3103
+ "epoch": 2.902481267433593,
3104
+ "grad_norm": 0.923864483833313,
3105
+ "learning_rate": 6.5754503396003544e-06,
3106
+ "loss": 0.6972,
3107
+ "step": 39800
3108
+ },
3109
+ {
3110
+ "epoch": 2.909773750706459,
3111
+ "grad_norm": 0.9156636595726013,
3112
+ "learning_rate": 6.083275912983562e-06,
3113
+ "loss": 0.7064,
3114
+ "step": 39900
3115
+ },
3116
+ {
3117
+ "epoch": 2.917066233979326,
3118
+ "grad_norm": 0.9568312168121338,
3119
+ "learning_rate": 5.591101486366768e-06,
3120
+ "loss": 0.6969,
3121
+ "step": 40000
3122
+ },
3123
+ {
3124
+ "epoch": 2.917066233979326,
3125
+ "eval_loss": 0.7027888894081116,
3126
+ "eval_runtime": 61.1155,
3127
+ "eval_samples_per_second": 146.526,
3128
+ "eval_steps_per_second": 18.326,
3129
+ "step": 40000
3130
+ }
3131
+ ],
3132
+ "logging_steps": 100,
3133
+ "max_steps": 41136,
3134
+ "num_input_tokens_seen": 0,
3135
+ "num_train_epochs": 3,
3136
+ "save_steps": 1000,
3137
+ "stateful_callbacks": {
3138
+ "TrainerControl": {
3139
+ "args": {
3140
+ "should_epoch_stop": false,
3141
+ "should_evaluate": false,
3142
+ "should_log": false,
3143
+ "should_save": true,
3144
+ "should_training_stop": false
3145
+ },
3146
+ "attributes": {}
3147
+ }
3148
+ },
3149
+ "total_flos": 7.07908337664e+17,
3150
+ "train_batch_size": 8,
3151
+ "trial_name": null,
3152
+ "trial_params": null
3153
+ }
checkpoint-40000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffa18fa243cccfbf729510f7d83fcb184f78dfbd7718a3073ec148d996a46094
3
+ size 5713
checkpoint-41000/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 32,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 16,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "q_proj",
25
+ "o_proj",
26
+ "v_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-41000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:017b6d860b5fcff897c917d1a4d7a6873cc30ff17c4c60eb2ea531f2316f5089
3
+ size 8676008
checkpoint-41000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c58566385e27a03801d627eadfb8c636afe591b8b5676a485b59a3c481f608a
3
+ size 8878080
checkpoint-41000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ceccadd7f022265a48e33d102c69d7418bb357fd78b83f973f75d540e8752845
3
+ size 1465
checkpoint-41000/trainer_state.json ADDED
@@ -0,0 +1,3231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.989991066707991,
5
+ "eval_steps": 1000,
6
+ "global_step": 41000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.007292483272866493,
13
+ "grad_norm": 2.1235318183898926,
14
+ "learning_rate": 4e-05,
15
+ "loss": 2.7429,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.014584966545732986,
20
+ "grad_norm": 1.9533482789993286,
21
+ "learning_rate": 8e-05,
22
+ "loss": 1.4786,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.02187744981859948,
27
+ "grad_norm": 1.5908012390136719,
28
+ "learning_rate": 0.00012,
29
+ "loss": 1.252,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.029169933091465972,
34
+ "grad_norm": 1.592781662940979,
35
+ "learning_rate": 0.00016,
36
+ "loss": 1.1674,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.036462416364332464,
41
+ "grad_norm": 1.4071415662765503,
42
+ "learning_rate": 0.0002,
43
+ "loss": 1.101,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.04375489963719896,
48
+ "grad_norm": 1.4228886365890503,
49
+ "learning_rate": 0.0001995078255733832,
50
+ "loss": 1.0487,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.05104738291006545,
55
+ "grad_norm": 1.2705847024917603,
56
+ "learning_rate": 0.00019901565114676642,
57
+ "loss": 1.0119,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.058339866182931945,
62
+ "grad_norm": 1.1770137548446655,
63
+ "learning_rate": 0.00019852347672014964,
64
+ "loss": 0.9906,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.06563234945579843,
69
+ "grad_norm": 1.1681164503097534,
70
+ "learning_rate": 0.00019803130229353283,
71
+ "loss": 0.9645,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.07292483272866493,
76
+ "grad_norm": 1.020504117012024,
77
+ "learning_rate": 0.00019753912786691605,
78
+ "loss": 0.9525,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.07292483272866493,
83
+ "eval_loss": 0.9407642483711243,
84
+ "eval_runtime": 61.0906,
85
+ "eval_samples_per_second": 146.586,
86
+ "eval_steps_per_second": 18.333,
87
+ "step": 1000
88
+ },
89
+ {
90
+ "epoch": 0.08021731600153142,
91
+ "grad_norm": 1.079444408416748,
92
+ "learning_rate": 0.00019704695344029924,
93
+ "loss": 0.9414,
94
+ "step": 1100
95
+ },
96
+ {
97
+ "epoch": 0.08750979927439792,
98
+ "grad_norm": 1.057377576828003,
99
+ "learning_rate": 0.00019655477901368246,
100
+ "loss": 0.9231,
101
+ "step": 1200
102
+ },
103
+ {
104
+ "epoch": 0.0948022825472644,
105
+ "grad_norm": 1.068018913269043,
106
+ "learning_rate": 0.00019606260458706568,
107
+ "loss": 0.9168,
108
+ "step": 1300
109
+ },
110
+ {
111
+ "epoch": 0.1020947658201309,
112
+ "grad_norm": 0.9460920095443726,
113
+ "learning_rate": 0.00019557043016044887,
114
+ "loss": 0.9031,
115
+ "step": 1400
116
+ },
117
+ {
118
+ "epoch": 0.1093872490929974,
119
+ "grad_norm": 1.056226134300232,
120
+ "learning_rate": 0.00019507825573383206,
121
+ "loss": 0.8901,
122
+ "step": 1500
123
+ },
124
+ {
125
+ "epoch": 0.11667973236586389,
126
+ "grad_norm": 1.0429835319519043,
127
+ "learning_rate": 0.00019458608130721528,
128
+ "loss": 0.8928,
129
+ "step": 1600
130
+ },
131
+ {
132
+ "epoch": 0.12397221563873038,
133
+ "grad_norm": 1.050790548324585,
134
+ "learning_rate": 0.0001940939068805985,
135
+ "loss": 0.8803,
136
+ "step": 1700
137
+ },
138
+ {
139
+ "epoch": 0.13126469891159687,
140
+ "grad_norm": 0.9586555361747742,
141
+ "learning_rate": 0.0001936017324539817,
142
+ "loss": 0.8809,
143
+ "step": 1800
144
+ },
145
+ {
146
+ "epoch": 0.13855718218446336,
147
+ "grad_norm": 0.985379159450531,
148
+ "learning_rate": 0.00019310955802736491,
149
+ "loss": 0.8743,
150
+ "step": 1900
151
+ },
152
+ {
153
+ "epoch": 0.14584966545732986,
154
+ "grad_norm": 0.9307010769844055,
155
+ "learning_rate": 0.00019261738360074813,
156
+ "loss": 0.8727,
157
+ "step": 2000
158
+ },
159
+ {
160
+ "epoch": 0.14584966545732986,
161
+ "eval_loss": 0.86456698179245,
162
+ "eval_runtime": 60.6283,
163
+ "eval_samples_per_second": 147.703,
164
+ "eval_steps_per_second": 18.473,
165
+ "step": 2000
166
+ },
167
+ {
168
+ "epoch": 0.15314214873019635,
169
+ "grad_norm": 1.0384063720703125,
170
+ "learning_rate": 0.00019212520917413133,
171
+ "loss": 0.8742,
172
+ "step": 2100
173
+ },
174
+ {
175
+ "epoch": 0.16043463200306285,
176
+ "grad_norm": 0.9662402868270874,
177
+ "learning_rate": 0.00019163303474751452,
178
+ "loss": 0.8661,
179
+ "step": 2200
180
+ },
181
+ {
182
+ "epoch": 0.16772711527592934,
183
+ "grad_norm": 0.9773098230361938,
184
+ "learning_rate": 0.00019114086032089774,
185
+ "loss": 0.8576,
186
+ "step": 2300
187
+ },
188
+ {
189
+ "epoch": 0.17501959854879584,
190
+ "grad_norm": 0.9672012329101562,
191
+ "learning_rate": 0.00019064868589428093,
192
+ "loss": 0.8595,
193
+ "step": 2400
194
+ },
195
+ {
196
+ "epoch": 0.1823120818216623,
197
+ "grad_norm": 0.9758124351501465,
198
+ "learning_rate": 0.00019015651146766415,
199
+ "loss": 0.8524,
200
+ "step": 2500
201
+ },
202
+ {
203
+ "epoch": 0.1896045650945288,
204
+ "grad_norm": 0.972232460975647,
205
+ "learning_rate": 0.00018966433704104737,
206
+ "loss": 0.8468,
207
+ "step": 2600
208
+ },
209
+ {
210
+ "epoch": 0.1968970483673953,
211
+ "grad_norm": 0.9417553544044495,
212
+ "learning_rate": 0.00018917216261443056,
213
+ "loss": 0.8412,
214
+ "step": 2700
215
+ },
216
+ {
217
+ "epoch": 0.2041895316402618,
218
+ "grad_norm": 0.9395071864128113,
219
+ "learning_rate": 0.00018867998818781375,
220
+ "loss": 0.8413,
221
+ "step": 2800
222
+ },
223
+ {
224
+ "epoch": 0.2114820149131283,
225
+ "grad_norm": 0.9951208233833313,
226
+ "learning_rate": 0.000188187813761197,
227
+ "loss": 0.8345,
228
+ "step": 2900
229
+ },
230
+ {
231
+ "epoch": 0.2187744981859948,
232
+ "grad_norm": 0.9656242728233337,
233
+ "learning_rate": 0.0001876956393345802,
234
+ "loss": 0.8317,
235
+ "step": 3000
236
+ },
237
+ {
238
+ "epoch": 0.2187744981859948,
239
+ "eval_loss": 0.8318613767623901,
240
+ "eval_runtime": 61.1356,
241
+ "eval_samples_per_second": 146.478,
242
+ "eval_steps_per_second": 18.32,
243
+ "step": 3000
244
+ },
245
+ {
246
+ "epoch": 0.22606698145886128,
247
+ "grad_norm": 0.8810185194015503,
248
+ "learning_rate": 0.00018720346490796338,
249
+ "loss": 0.8321,
250
+ "step": 3100
251
+ },
252
+ {
253
+ "epoch": 0.23335946473172778,
254
+ "grad_norm": 0.9199262857437134,
255
+ "learning_rate": 0.0001867112904813466,
256
+ "loss": 0.8406,
257
+ "step": 3200
258
+ },
259
+ {
260
+ "epoch": 0.24065194800459427,
261
+ "grad_norm": 0.9557051658630371,
262
+ "learning_rate": 0.00018621911605472982,
263
+ "loss": 0.8277,
264
+ "step": 3300
265
+ },
266
+ {
267
+ "epoch": 0.24794443127746077,
268
+ "grad_norm": 0.9777804017066956,
269
+ "learning_rate": 0.000185726941628113,
270
+ "loss": 0.8272,
271
+ "step": 3400
272
+ },
273
+ {
274
+ "epoch": 0.25523691455032727,
275
+ "grad_norm": 0.8856322169303894,
276
+ "learning_rate": 0.00018523476720149623,
277
+ "loss": 0.8256,
278
+ "step": 3500
279
+ },
280
+ {
281
+ "epoch": 0.26252939782319373,
282
+ "grad_norm": 0.9196017980575562,
283
+ "learning_rate": 0.00018474259277487942,
284
+ "loss": 0.8234,
285
+ "step": 3600
286
+ },
287
+ {
288
+ "epoch": 0.26982188109606026,
289
+ "grad_norm": 0.9568464159965515,
290
+ "learning_rate": 0.00018425041834826264,
291
+ "loss": 0.8193,
292
+ "step": 3700
293
+ },
294
+ {
295
+ "epoch": 0.2771143643689267,
296
+ "grad_norm": 0.9552770256996155,
297
+ "learning_rate": 0.00018375824392164583,
298
+ "loss": 0.8179,
299
+ "step": 3800
300
+ },
301
+ {
302
+ "epoch": 0.28440684764179325,
303
+ "grad_norm": 0.8997077345848083,
304
+ "learning_rate": 0.00018326606949502905,
305
+ "loss": 0.8138,
306
+ "step": 3900
307
+ },
308
+ {
309
+ "epoch": 0.2916993309146597,
310
+ "grad_norm": 0.8896480202674866,
311
+ "learning_rate": 0.00018277389506841224,
312
+ "loss": 0.8172,
313
+ "step": 4000
314
+ },
315
+ {
316
+ "epoch": 0.2916993309146597,
317
+ "eval_loss": 0.8123040199279785,
318
+ "eval_runtime": 60.7914,
319
+ "eval_samples_per_second": 147.307,
320
+ "eval_steps_per_second": 18.424,
321
+ "step": 4000
322
+ },
323
+ {
324
+ "epoch": 0.2989918141875262,
325
+ "grad_norm": 0.9520764350891113,
326
+ "learning_rate": 0.00018228172064179546,
327
+ "loss": 0.8183,
328
+ "step": 4100
329
+ },
330
+ {
331
+ "epoch": 0.3062842974603927,
332
+ "grad_norm": 0.9373065233230591,
333
+ "learning_rate": 0.00018178954621517868,
334
+ "loss": 0.8132,
335
+ "step": 4200
336
+ },
337
+ {
338
+ "epoch": 0.3135767807332592,
339
+ "grad_norm": 0.8733066916465759,
340
+ "learning_rate": 0.00018129737178856187,
341
+ "loss": 0.811,
342
+ "step": 4300
343
+ },
344
+ {
345
+ "epoch": 0.3208692640061257,
346
+ "grad_norm": 0.8866516351699829,
347
+ "learning_rate": 0.00018080519736194507,
348
+ "loss": 0.8093,
349
+ "step": 4400
350
+ },
351
+ {
352
+ "epoch": 0.32816174727899217,
353
+ "grad_norm": 0.9394953846931458,
354
+ "learning_rate": 0.00018031302293532828,
355
+ "loss": 0.8035,
356
+ "step": 4500
357
+ },
358
+ {
359
+ "epoch": 0.3354542305518587,
360
+ "grad_norm": 0.9133720993995667,
361
+ "learning_rate": 0.0001798208485087115,
362
+ "loss": 0.8054,
363
+ "step": 4600
364
+ },
365
+ {
366
+ "epoch": 0.34274671382472516,
367
+ "grad_norm": 0.9428606629371643,
368
+ "learning_rate": 0.0001793286740820947,
369
+ "loss": 0.8076,
370
+ "step": 4700
371
+ },
372
+ {
373
+ "epoch": 0.3500391970975917,
374
+ "grad_norm": 0.8996593356132507,
375
+ "learning_rate": 0.00017883649965547792,
376
+ "loss": 0.812,
377
+ "step": 4800
378
+ },
379
+ {
380
+ "epoch": 0.35733168037045815,
381
+ "grad_norm": 0.9113749265670776,
382
+ "learning_rate": 0.0001783443252288611,
383
+ "loss": 0.8048,
384
+ "step": 4900
385
+ },
386
+ {
387
+ "epoch": 0.3646241636433246,
388
+ "grad_norm": 0.9185646176338196,
389
+ "learning_rate": 0.00017785215080224433,
390
+ "loss": 0.8023,
391
+ "step": 5000
392
+ },
393
+ {
394
+ "epoch": 0.3646241636433246,
395
+ "eval_loss": 0.7973803877830505,
396
+ "eval_runtime": 60.8068,
397
+ "eval_samples_per_second": 147.27,
398
+ "eval_steps_per_second": 18.419,
399
+ "step": 5000
400
+ },
401
+ {
402
+ "epoch": 0.37191664691619114,
403
+ "grad_norm": 0.8994658589363098,
404
+ "learning_rate": 0.00017735997637562755,
405
+ "loss": 0.8089,
406
+ "step": 5100
407
+ },
408
+ {
409
+ "epoch": 0.3792091301890576,
410
+ "grad_norm": 0.8724523782730103,
411
+ "learning_rate": 0.00017686780194901074,
412
+ "loss": 0.8015,
413
+ "step": 5200
414
+ },
415
+ {
416
+ "epoch": 0.38650161346192413,
417
+ "grad_norm": 0.8285540342330933,
418
+ "learning_rate": 0.00017637562752239393,
419
+ "loss": 0.7944,
420
+ "step": 5300
421
+ },
422
+ {
423
+ "epoch": 0.3937940967347906,
424
+ "grad_norm": 0.8982509970664978,
425
+ "learning_rate": 0.00017588345309577718,
426
+ "loss": 0.7952,
427
+ "step": 5400
428
+ },
429
+ {
430
+ "epoch": 0.4010865800076571,
431
+ "grad_norm": 0.9266172051429749,
432
+ "learning_rate": 0.00017539127866916037,
433
+ "loss": 0.7978,
434
+ "step": 5500
435
+ },
436
+ {
437
+ "epoch": 0.4083790632805236,
438
+ "grad_norm": 0.901662290096283,
439
+ "learning_rate": 0.00017489910424254356,
440
+ "loss": 0.7966,
441
+ "step": 5600
442
+ },
443
+ {
444
+ "epoch": 0.4156715465533901,
445
+ "grad_norm": 0.9309051036834717,
446
+ "learning_rate": 0.00017440692981592678,
447
+ "loss": 0.7975,
448
+ "step": 5700
449
+ },
450
+ {
451
+ "epoch": 0.4229640298262566,
452
+ "grad_norm": 0.8789328336715698,
453
+ "learning_rate": 0.00017391475538930997,
454
+ "loss": 0.7997,
455
+ "step": 5800
456
+ },
457
+ {
458
+ "epoch": 0.4302565130991231,
459
+ "grad_norm": 0.8636139035224915,
460
+ "learning_rate": 0.0001734225809626932,
461
+ "loss": 0.7914,
462
+ "step": 5900
463
+ },
464
+ {
465
+ "epoch": 0.4375489963719896,
466
+ "grad_norm": 0.9468287229537964,
467
+ "learning_rate": 0.00017293040653607638,
468
+ "loss": 0.7859,
469
+ "step": 6000
470
+ },
471
+ {
472
+ "epoch": 0.4375489963719896,
473
+ "eval_loss": 0.7869976162910461,
474
+ "eval_runtime": 60.7741,
475
+ "eval_samples_per_second": 147.349,
476
+ "eval_steps_per_second": 18.429,
477
+ "step": 6000
478
+ },
479
+ {
480
+ "epoch": 0.44484147964485604,
481
+ "grad_norm": 0.867158055305481,
482
+ "learning_rate": 0.0001724382321094596,
483
+ "loss": 0.7924,
484
+ "step": 6100
485
+ },
486
+ {
487
+ "epoch": 0.45213396291772256,
488
+ "grad_norm": 0.9379836320877075,
489
+ "learning_rate": 0.0001719460576828428,
490
+ "loss": 0.7902,
491
+ "step": 6200
492
+ },
493
+ {
494
+ "epoch": 0.45942644619058903,
495
+ "grad_norm": 0.8591951727867126,
496
+ "learning_rate": 0.000171453883256226,
497
+ "loss": 0.7926,
498
+ "step": 6300
499
+ },
500
+ {
501
+ "epoch": 0.46671892946345556,
502
+ "grad_norm": 0.9702317118644714,
503
+ "learning_rate": 0.00017096170882960923,
504
+ "loss": 0.7867,
505
+ "step": 6400
506
+ },
507
+ {
508
+ "epoch": 0.474011412736322,
509
+ "grad_norm": 0.902302086353302,
510
+ "learning_rate": 0.00017046953440299242,
511
+ "loss": 0.7897,
512
+ "step": 6500
513
+ },
514
+ {
515
+ "epoch": 0.48130389600918855,
516
+ "grad_norm": 0.889926552772522,
517
+ "learning_rate": 0.00016997735997637561,
518
+ "loss": 0.7857,
519
+ "step": 6600
520
+ },
521
+ {
522
+ "epoch": 0.488596379282055,
523
+ "grad_norm": 0.8906420469284058,
524
+ "learning_rate": 0.00016948518554975886,
525
+ "loss": 0.7878,
526
+ "step": 6700
527
+ },
528
+ {
529
+ "epoch": 0.49588886255492154,
530
+ "grad_norm": 0.919983983039856,
531
+ "learning_rate": 0.00016899301112314205,
532
+ "loss": 0.7876,
533
+ "step": 6800
534
+ },
535
+ {
536
+ "epoch": 0.5031813458277881,
537
+ "grad_norm": 0.8610624670982361,
538
+ "learning_rate": 0.00016850083669652524,
539
+ "loss": 0.7923,
540
+ "step": 6900
541
+ },
542
+ {
543
+ "epoch": 0.5104738291006545,
544
+ "grad_norm": 0.9339637160301208,
545
+ "learning_rate": 0.00016800866226990846,
546
+ "loss": 0.7837,
547
+ "step": 7000
548
+ },
549
+ {
550
+ "epoch": 0.5104738291006545,
551
+ "eval_loss": 0.7791191935539246,
552
+ "eval_runtime": 60.8878,
553
+ "eval_samples_per_second": 147.074,
554
+ "eval_steps_per_second": 18.395,
555
+ "step": 7000
556
+ },
557
+ {
558
+ "epoch": 0.517766312373521,
559
+ "grad_norm": 0.9073446393013,
560
+ "learning_rate": 0.00016751648784329168,
561
+ "loss": 0.7809,
562
+ "step": 7100
563
+ },
564
+ {
565
+ "epoch": 0.5250587956463875,
566
+ "grad_norm": 0.9348235726356506,
567
+ "learning_rate": 0.00016702431341667487,
568
+ "loss": 0.7793,
569
+ "step": 7200
570
+ },
571
+ {
572
+ "epoch": 0.5323512789192539,
573
+ "grad_norm": 0.9155163168907166,
574
+ "learning_rate": 0.0001665321389900581,
575
+ "loss": 0.7821,
576
+ "step": 7300
577
+ },
578
+ {
579
+ "epoch": 0.5396437621921205,
580
+ "grad_norm": 0.9328250885009766,
581
+ "learning_rate": 0.00016603996456344129,
582
+ "loss": 0.7806,
583
+ "step": 7400
584
+ },
585
+ {
586
+ "epoch": 0.546936245464987,
587
+ "grad_norm": 0.8911275863647461,
588
+ "learning_rate": 0.00016554779013682448,
589
+ "loss": 0.7782,
590
+ "step": 7500
591
+ },
592
+ {
593
+ "epoch": 0.5542287287378534,
594
+ "grad_norm": 0.8989250659942627,
595
+ "learning_rate": 0.00016505561571020772,
596
+ "loss": 0.779,
597
+ "step": 7600
598
+ },
599
+ {
600
+ "epoch": 0.5615212120107199,
601
+ "grad_norm": 0.8869723081588745,
602
+ "learning_rate": 0.00016456344128359092,
603
+ "loss": 0.7822,
604
+ "step": 7700
605
+ },
606
+ {
607
+ "epoch": 0.5688136952835865,
608
+ "grad_norm": 0.8631371259689331,
609
+ "learning_rate": 0.0001640712668569741,
610
+ "loss": 0.7768,
611
+ "step": 7800
612
+ },
613
+ {
614
+ "epoch": 0.576106178556453,
615
+ "grad_norm": 0.8868420720100403,
616
+ "learning_rate": 0.00016357909243035733,
617
+ "loss": 0.7834,
618
+ "step": 7900
619
+ },
620
+ {
621
+ "epoch": 0.5833986618293194,
622
+ "grad_norm": 0.9253202080726624,
623
+ "learning_rate": 0.00016308691800374055,
624
+ "loss": 0.773,
625
+ "step": 8000
626
+ },
627
+ {
628
+ "epoch": 0.5833986618293194,
629
+ "eval_loss": 0.7733862400054932,
630
+ "eval_runtime": 60.8911,
631
+ "eval_samples_per_second": 147.066,
632
+ "eval_steps_per_second": 18.394,
633
+ "step": 8000
634
+ },
635
+ {
636
+ "epoch": 0.5906911451021859,
637
+ "grad_norm": 0.830760657787323,
638
+ "learning_rate": 0.00016259474357712374,
639
+ "loss": 0.7756,
640
+ "step": 8100
641
+ },
642
+ {
643
+ "epoch": 0.5979836283750524,
644
+ "grad_norm": 0.9371838569641113,
645
+ "learning_rate": 0.00016210256915050696,
646
+ "loss": 0.776,
647
+ "step": 8200
648
+ },
649
+ {
650
+ "epoch": 0.605276111647919,
651
+ "grad_norm": 0.8486947417259216,
652
+ "learning_rate": 0.00016161039472389015,
653
+ "loss": 0.7758,
654
+ "step": 8300
655
+ },
656
+ {
657
+ "epoch": 0.6125685949207854,
658
+ "grad_norm": 0.8888623118400574,
659
+ "learning_rate": 0.00016111822029727337,
660
+ "loss": 0.783,
661
+ "step": 8400
662
+ },
663
+ {
664
+ "epoch": 0.6198610781936519,
665
+ "grad_norm": 0.9176976084709167,
666
+ "learning_rate": 0.00016062604587065656,
667
+ "loss": 0.7782,
668
+ "step": 8500
669
+ },
670
+ {
671
+ "epoch": 0.6271535614665184,
672
+ "grad_norm": 0.90993732213974,
673
+ "learning_rate": 0.00016013387144403978,
674
+ "loss": 0.7741,
675
+ "step": 8600
676
+ },
677
+ {
678
+ "epoch": 0.6344460447393849,
679
+ "grad_norm": 0.8461544513702393,
680
+ "learning_rate": 0.00015964169701742297,
681
+ "loss": 0.7782,
682
+ "step": 8700
683
+ },
684
+ {
685
+ "epoch": 0.6417385280122514,
686
+ "grad_norm": 0.8642047643661499,
687
+ "learning_rate": 0.0001591495225908062,
688
+ "loss": 0.7706,
689
+ "step": 8800
690
+ },
691
+ {
692
+ "epoch": 0.6490310112851179,
693
+ "grad_norm": 0.8944571018218994,
694
+ "learning_rate": 0.0001586573481641894,
695
+ "loss": 0.7727,
696
+ "step": 8900
697
+ },
698
+ {
699
+ "epoch": 0.6563234945579843,
700
+ "grad_norm": 0.9075286984443665,
701
+ "learning_rate": 0.0001581651737375726,
702
+ "loss": 0.7748,
703
+ "step": 9000
704
+ },
705
+ {
706
+ "epoch": 0.6563234945579843,
707
+ "eval_loss": 0.7666329741477966,
708
+ "eval_runtime": 60.5924,
709
+ "eval_samples_per_second": 147.791,
710
+ "eval_steps_per_second": 18.484,
711
+ "step": 9000
712
+ },
713
+ {
714
+ "epoch": 0.6636159778308508,
715
+ "grad_norm": 0.9164955615997314,
716
+ "learning_rate": 0.0001576729993109558,
717
+ "loss": 0.7792,
718
+ "step": 9100
719
+ },
720
+ {
721
+ "epoch": 0.6709084611037174,
722
+ "grad_norm": 0.8446054458618164,
723
+ "learning_rate": 0.000157180824884339,
724
+ "loss": 0.7661,
725
+ "step": 9200
726
+ },
727
+ {
728
+ "epoch": 0.6782009443765838,
729
+ "grad_norm": 0.8793991804122925,
730
+ "learning_rate": 0.00015668865045772223,
731
+ "loss": 0.7678,
732
+ "step": 9300
733
+ },
734
+ {
735
+ "epoch": 0.6854934276494503,
736
+ "grad_norm": 0.8772592544555664,
737
+ "learning_rate": 0.00015619647603110542,
738
+ "loss": 0.7708,
739
+ "step": 9400
740
+ },
741
+ {
742
+ "epoch": 0.6927859109223168,
743
+ "grad_norm": 0.854118824005127,
744
+ "learning_rate": 0.00015570430160448864,
745
+ "loss": 0.7616,
746
+ "step": 9500
747
+ },
748
+ {
749
+ "epoch": 0.7000783941951834,
750
+ "grad_norm": 0.8653910756111145,
751
+ "learning_rate": 0.00015521212717787183,
752
+ "loss": 0.767,
753
+ "step": 9600
754
+ },
755
+ {
756
+ "epoch": 0.7073708774680498,
757
+ "grad_norm": 0.8890120387077332,
758
+ "learning_rate": 0.00015471995275125505,
759
+ "loss": 0.7657,
760
+ "step": 9700
761
+ },
762
+ {
763
+ "epoch": 0.7146633607409163,
764
+ "grad_norm": 0.8451828360557556,
765
+ "learning_rate": 0.00015422777832463827,
766
+ "loss": 0.7656,
767
+ "step": 9800
768
+ },
769
+ {
770
+ "epoch": 0.7219558440137828,
771
+ "grad_norm": 0.9029329419136047,
772
+ "learning_rate": 0.00015373560389802146,
773
+ "loss": 0.7749,
774
+ "step": 9900
775
+ },
776
+ {
777
+ "epoch": 0.7292483272866492,
778
+ "grad_norm": 0.8538834452629089,
779
+ "learning_rate": 0.00015324342947140466,
780
+ "loss": 0.763,
781
+ "step": 10000
782
+ },
783
+ {
784
+ "epoch": 0.7292483272866492,
785
+ "eval_loss": 0.76123046875,
786
+ "eval_runtime": 60.847,
787
+ "eval_samples_per_second": 147.172,
788
+ "eval_steps_per_second": 18.407,
789
+ "step": 10000
790
+ },
791
+ {
792
+ "epoch": 0.7365408105595158,
793
+ "grad_norm": 0.8594367504119873,
794
+ "learning_rate": 0.00015275125504478788,
795
+ "loss": 0.7693,
796
+ "step": 10100
797
+ },
798
+ {
799
+ "epoch": 0.7438332938323823,
800
+ "grad_norm": 0.8748040199279785,
801
+ "learning_rate": 0.0001522590806181711,
802
+ "loss": 0.7684,
803
+ "step": 10200
804
+ },
805
+ {
806
+ "epoch": 0.7511257771052487,
807
+ "grad_norm": 0.9177483320236206,
808
+ "learning_rate": 0.0001517669061915543,
809
+ "loss": 0.7599,
810
+ "step": 10300
811
+ },
812
+ {
813
+ "epoch": 0.7584182603781152,
814
+ "grad_norm": 0.8988757729530334,
815
+ "learning_rate": 0.0001512747317649375,
816
+ "loss": 0.7648,
817
+ "step": 10400
818
+ },
819
+ {
820
+ "epoch": 0.7657107436509818,
821
+ "grad_norm": 0.8735676407814026,
822
+ "learning_rate": 0.00015078255733832073,
823
+ "loss": 0.7656,
824
+ "step": 10500
825
+ },
826
+ {
827
+ "epoch": 0.7730032269238483,
828
+ "grad_norm": 0.8750614523887634,
829
+ "learning_rate": 0.00015029038291170392,
830
+ "loss": 0.7632,
831
+ "step": 10600
832
+ },
833
+ {
834
+ "epoch": 0.7802957101967147,
835
+ "grad_norm": 0.8786306381225586,
836
+ "learning_rate": 0.0001497982084850871,
837
+ "loss": 0.7659,
838
+ "step": 10700
839
+ },
840
+ {
841
+ "epoch": 0.7875881934695812,
842
+ "grad_norm": 0.811834990978241,
843
+ "learning_rate": 0.00014930603405847033,
844
+ "loss": 0.7652,
845
+ "step": 10800
846
+ },
847
+ {
848
+ "epoch": 0.7948806767424477,
849
+ "grad_norm": 0.8844282031059265,
850
+ "learning_rate": 0.00014881385963185352,
851
+ "loss": 0.7623,
852
+ "step": 10900
853
+ },
854
+ {
855
+ "epoch": 0.8021731600153142,
856
+ "grad_norm": 0.8444844484329224,
857
+ "learning_rate": 0.00014832168520523674,
858
+ "loss": 0.7622,
859
+ "step": 11000
860
+ },
861
+ {
862
+ "epoch": 0.8021731600153142,
863
+ "eval_loss": 0.75812828540802,
864
+ "eval_runtime": 60.7569,
865
+ "eval_samples_per_second": 147.391,
866
+ "eval_steps_per_second": 18.434,
867
+ "step": 11000
868
+ },
869
+ {
870
+ "epoch": 0.8094656432881807,
871
+ "grad_norm": 0.8396947979927063,
872
+ "learning_rate": 0.00014782951077861996,
873
+ "loss": 0.7673,
874
+ "step": 11100
875
+ },
876
+ {
877
+ "epoch": 0.8167581265610472,
878
+ "grad_norm": 0.8890758752822876,
879
+ "learning_rate": 0.00014733733635200315,
880
+ "loss": 0.7551,
881
+ "step": 11200
882
+ },
883
+ {
884
+ "epoch": 0.8240506098339136,
885
+ "grad_norm": 0.8038908839225769,
886
+ "learning_rate": 0.00014684516192538634,
887
+ "loss": 0.7612,
888
+ "step": 11300
889
+ },
890
+ {
891
+ "epoch": 0.8313430931067802,
892
+ "grad_norm": 0.8224745392799377,
893
+ "learning_rate": 0.0001463529874987696,
894
+ "loss": 0.7618,
895
+ "step": 11400
896
+ },
897
+ {
898
+ "epoch": 0.8386355763796467,
899
+ "grad_norm": 0.8691264390945435,
900
+ "learning_rate": 0.00014586081307215278,
901
+ "loss": 0.7618,
902
+ "step": 11500
903
+ },
904
+ {
905
+ "epoch": 0.8459280596525132,
906
+ "grad_norm": 0.8442777395248413,
907
+ "learning_rate": 0.00014536863864553597,
908
+ "loss": 0.7671,
909
+ "step": 11600
910
+ },
911
+ {
912
+ "epoch": 0.8532205429253796,
913
+ "grad_norm": 0.8520532846450806,
914
+ "learning_rate": 0.0001448764642189192,
915
+ "loss": 0.7625,
916
+ "step": 11700
917
+ },
918
+ {
919
+ "epoch": 0.8605130261982462,
920
+ "grad_norm": 0.908760666847229,
921
+ "learning_rate": 0.0001443842897923024,
922
+ "loss": 0.7615,
923
+ "step": 11800
924
+ },
925
+ {
926
+ "epoch": 0.8678055094711127,
927
+ "grad_norm": 0.8004080057144165,
928
+ "learning_rate": 0.0001438921153656856,
929
+ "loss": 0.7632,
930
+ "step": 11900
931
+ },
932
+ {
933
+ "epoch": 0.8750979927439791,
934
+ "grad_norm": 0.8449864983558655,
935
+ "learning_rate": 0.00014339994093906882,
936
+ "loss": 0.7574,
937
+ "step": 12000
938
+ },
939
+ {
940
+ "epoch": 0.8750979927439791,
941
+ "eval_loss": 0.752128005027771,
942
+ "eval_runtime": 61.1399,
943
+ "eval_samples_per_second": 146.467,
944
+ "eval_steps_per_second": 18.319,
945
+ "step": 12000
946
+ },
947
+ {
948
+ "epoch": 0.8823904760168456,
949
+ "grad_norm": 0.8218274116516113,
950
+ "learning_rate": 0.00014290776651245201,
951
+ "loss": 0.7555,
952
+ "step": 12100
953
+ },
954
+ {
955
+ "epoch": 0.8896829592897121,
956
+ "grad_norm": 0.8944920897483826,
957
+ "learning_rate": 0.00014241559208583523,
958
+ "loss": 0.7594,
959
+ "step": 12200
960
+ },
961
+ {
962
+ "epoch": 0.8969754425625787,
963
+ "grad_norm": 0.9254937767982483,
964
+ "learning_rate": 0.00014192341765921845,
965
+ "loss": 0.7598,
966
+ "step": 12300
967
+ },
968
+ {
969
+ "epoch": 0.9042679258354451,
970
+ "grad_norm": 0.8887091875076294,
971
+ "learning_rate": 0.00014143124323260164,
972
+ "loss": 0.7625,
973
+ "step": 12400
974
+ },
975
+ {
976
+ "epoch": 0.9115604091083116,
977
+ "grad_norm": 0.8478124737739563,
978
+ "learning_rate": 0.00014093906880598484,
979
+ "loss": 0.756,
980
+ "step": 12500
981
+ },
982
+ {
983
+ "epoch": 0.9188528923811781,
984
+ "grad_norm": 0.9377927780151367,
985
+ "learning_rate": 0.00014044689437936805,
986
+ "loss": 0.7606,
987
+ "step": 12600
988
+ },
989
+ {
990
+ "epoch": 0.9261453756540446,
991
+ "grad_norm": 0.838175892829895,
992
+ "learning_rate": 0.00013995471995275127,
993
+ "loss": 0.7605,
994
+ "step": 12700
995
+ },
996
+ {
997
+ "epoch": 0.9334378589269111,
998
+ "grad_norm": 0.8345216512680054,
999
+ "learning_rate": 0.00013946254552613447,
1000
+ "loss": 0.7568,
1001
+ "step": 12800
1002
+ },
1003
+ {
1004
+ "epoch": 0.9407303421997776,
1005
+ "grad_norm": 0.894477367401123,
1006
+ "learning_rate": 0.00013897037109951766,
1007
+ "loss": 0.7535,
1008
+ "step": 12900
1009
+ },
1010
+ {
1011
+ "epoch": 0.948022825472644,
1012
+ "grad_norm": 0.849010169506073,
1013
+ "learning_rate": 0.00013847819667290088,
1014
+ "loss": 0.7465,
1015
+ "step": 13000
1016
+ },
1017
+ {
1018
+ "epoch": 0.948022825472644,
1019
+ "eval_loss": 0.7492165565490723,
1020
+ "eval_runtime": 60.7079,
1021
+ "eval_samples_per_second": 147.51,
1022
+ "eval_steps_per_second": 18.449,
1023
+ "step": 13000
1024
+ },
1025
+ {
1026
+ "epoch": 0.9553153087455105,
1027
+ "grad_norm": 0.8754207491874695,
1028
+ "learning_rate": 0.0001379860222462841,
1029
+ "loss": 0.7576,
1030
+ "step": 13100
1031
+ },
1032
+ {
1033
+ "epoch": 0.9626077920183771,
1034
+ "grad_norm": 0.8984807133674622,
1035
+ "learning_rate": 0.0001374938478196673,
1036
+ "loss": 0.7493,
1037
+ "step": 13200
1038
+ },
1039
+ {
1040
+ "epoch": 0.9699002752912436,
1041
+ "grad_norm": 0.8458361029624939,
1042
+ "learning_rate": 0.0001370016733930505,
1043
+ "loss": 0.7468,
1044
+ "step": 13300
1045
+ },
1046
+ {
1047
+ "epoch": 0.97719275856411,
1048
+ "grad_norm": 0.9169609546661377,
1049
+ "learning_rate": 0.0001365094989664337,
1050
+ "loss": 0.7515,
1051
+ "step": 13400
1052
+ },
1053
+ {
1054
+ "epoch": 0.9844852418369765,
1055
+ "grad_norm": 0.8027638792991638,
1056
+ "learning_rate": 0.00013601732453981692,
1057
+ "loss": 0.7551,
1058
+ "step": 13500
1059
+ },
1060
+ {
1061
+ "epoch": 0.9917777251098431,
1062
+ "grad_norm": 0.8572927117347717,
1063
+ "learning_rate": 0.00013552515011320014,
1064
+ "loss": 0.7481,
1065
+ "step": 13600
1066
+ },
1067
+ {
1068
+ "epoch": 0.9990702083827095,
1069
+ "grad_norm": 0.8624053001403809,
1070
+ "learning_rate": 0.00013503297568658333,
1071
+ "loss": 0.7481,
1072
+ "step": 13700
1073
+ },
1074
+ {
1075
+ "epoch": 1.0063991540719404,
1076
+ "grad_norm": 0.8915347456932068,
1077
+ "learning_rate": 0.00013454080125996652,
1078
+ "loss": 0.7463,
1079
+ "step": 13800
1080
+ },
1081
+ {
1082
+ "epoch": 1.0136916373448068,
1083
+ "grad_norm": 0.8233557939529419,
1084
+ "learning_rate": 0.00013404862683334977,
1085
+ "loss": 0.7398,
1086
+ "step": 13900
1087
+ },
1088
+ {
1089
+ "epoch": 1.0209841206176733,
1090
+ "grad_norm": 0.8467598557472229,
1091
+ "learning_rate": 0.00013355645240673296,
1092
+ "loss": 0.7402,
1093
+ "step": 14000
1094
+ },
1095
+ {
1096
+ "epoch": 1.0209841206176733,
1097
+ "eval_loss": 0.7458442449569702,
1098
+ "eval_runtime": 60.6887,
1099
+ "eval_samples_per_second": 147.556,
1100
+ "eval_steps_per_second": 18.455,
1101
+ "step": 14000
1102
+ },
1103
+ {
1104
+ "epoch": 1.0282766038905398,
1105
+ "grad_norm": 0.852739691734314,
1106
+ "learning_rate": 0.00013306427798011615,
1107
+ "loss": 0.7436,
1108
+ "step": 14100
1109
+ },
1110
+ {
1111
+ "epoch": 1.0355690871634062,
1112
+ "grad_norm": 0.8501101136207581,
1113
+ "learning_rate": 0.00013257210355349937,
1114
+ "loss": 0.7472,
1115
+ "step": 14200
1116
+ },
1117
+ {
1118
+ "epoch": 1.0428615704362727,
1119
+ "grad_norm": 0.8830447793006897,
1120
+ "learning_rate": 0.0001320799291268826,
1121
+ "loss": 0.7438,
1122
+ "step": 14300
1123
+ },
1124
+ {
1125
+ "epoch": 1.0501540537091394,
1126
+ "grad_norm": 0.8827272057533264,
1127
+ "learning_rate": 0.00013158775470026578,
1128
+ "loss": 0.7439,
1129
+ "step": 14400
1130
+ },
1131
+ {
1132
+ "epoch": 1.0574465369820059,
1133
+ "grad_norm": 0.7875618934631348,
1134
+ "learning_rate": 0.000131095580273649,
1135
+ "loss": 0.7426,
1136
+ "step": 14500
1137
+ },
1138
+ {
1139
+ "epoch": 1.0647390202548723,
1140
+ "grad_norm": 0.9906949996948242,
1141
+ "learning_rate": 0.0001306034058470322,
1142
+ "loss": 0.7418,
1143
+ "step": 14600
1144
+ },
1145
+ {
1146
+ "epoch": 1.0720315035277388,
1147
+ "grad_norm": 0.8803852200508118,
1148
+ "learning_rate": 0.00013011123142041538,
1149
+ "loss": 0.7421,
1150
+ "step": 14700
1151
+ },
1152
+ {
1153
+ "epoch": 1.0793239868006053,
1154
+ "grad_norm": 0.8951194286346436,
1155
+ "learning_rate": 0.0001296190569937986,
1156
+ "loss": 0.7429,
1157
+ "step": 14800
1158
+ },
1159
+ {
1160
+ "epoch": 1.0866164700734717,
1161
+ "grad_norm": 0.8548495769500732,
1162
+ "learning_rate": 0.00012912688256718182,
1163
+ "loss": 0.7462,
1164
+ "step": 14900
1165
+ },
1166
+ {
1167
+ "epoch": 1.0939089533463382,
1168
+ "grad_norm": 0.9326722025871277,
1169
+ "learning_rate": 0.00012863470814056501,
1170
+ "loss": 0.7515,
1171
+ "step": 15000
1172
+ },
1173
+ {
1174
+ "epoch": 1.0939089533463382,
1175
+ "eval_loss": 0.7423983812332153,
1176
+ "eval_runtime": 61.1091,
1177
+ "eval_samples_per_second": 146.541,
1178
+ "eval_steps_per_second": 18.328,
1179
+ "step": 15000
1180
+ },
1181
+ {
1182
+ "epoch": 1.1012014366192047,
1183
+ "grad_norm": 0.8803513646125793,
1184
+ "learning_rate": 0.00012814253371394823,
1185
+ "loss": 0.7369,
1186
+ "step": 15100
1187
+ },
1188
+ {
1189
+ "epoch": 1.1084939198920711,
1190
+ "grad_norm": 0.8555076122283936,
1191
+ "learning_rate": 0.00012765035928733145,
1192
+ "loss": 0.7414,
1193
+ "step": 15200
1194
+ },
1195
+ {
1196
+ "epoch": 1.1157864031649378,
1197
+ "grad_norm": 0.8760358691215515,
1198
+ "learning_rate": 0.00012715818486071464,
1199
+ "loss": 0.741,
1200
+ "step": 15300
1201
+ },
1202
+ {
1203
+ "epoch": 1.1230788864378043,
1204
+ "grad_norm": 0.8444579839706421,
1205
+ "learning_rate": 0.00012666601043409784,
1206
+ "loss": 0.7448,
1207
+ "step": 15400
1208
+ },
1209
+ {
1210
+ "epoch": 1.1303713697106708,
1211
+ "grad_norm": 0.8995528221130371,
1212
+ "learning_rate": 0.00012617383600748106,
1213
+ "loss": 0.7436,
1214
+ "step": 15500
1215
+ },
1216
+ {
1217
+ "epoch": 1.1376638529835372,
1218
+ "grad_norm": 0.8966475129127502,
1219
+ "learning_rate": 0.00012568166158086427,
1220
+ "loss": 0.7485,
1221
+ "step": 15600
1222
+ },
1223
+ {
1224
+ "epoch": 1.1449563362564037,
1225
+ "grad_norm": 0.8527953028678894,
1226
+ "learning_rate": 0.00012518948715424747,
1227
+ "loss": 0.7303,
1228
+ "step": 15700
1229
+ },
1230
+ {
1231
+ "epoch": 1.1522488195292702,
1232
+ "grad_norm": 0.8657513856887817,
1233
+ "learning_rate": 0.00012469731272763069,
1234
+ "loss": 0.7431,
1235
+ "step": 15800
1236
+ },
1237
+ {
1238
+ "epoch": 1.1595413028021366,
1239
+ "grad_norm": 0.8745185136795044,
1240
+ "learning_rate": 0.00012420513830101388,
1241
+ "loss": 0.7426,
1242
+ "step": 15900
1243
+ },
1244
+ {
1245
+ "epoch": 1.166833786075003,
1246
+ "grad_norm": 0.8729378581047058,
1247
+ "learning_rate": 0.0001237129638743971,
1248
+ "loss": 0.7389,
1249
+ "step": 16000
1250
+ },
1251
+ {
1252
+ "epoch": 1.166833786075003,
1253
+ "eval_loss": 0.740699291229248,
1254
+ "eval_runtime": 60.635,
1255
+ "eval_samples_per_second": 147.687,
1256
+ "eval_steps_per_second": 18.471,
1257
+ "step": 16000
1258
+ },
1259
+ {
1260
+ "epoch": 1.1741262693478696,
1261
+ "grad_norm": 0.8877021670341492,
1262
+ "learning_rate": 0.00012322078944778032,
1263
+ "loss": 0.7419,
1264
+ "step": 16100
1265
+ },
1266
+ {
1267
+ "epoch": 1.1814187526207363,
1268
+ "grad_norm": 0.9095293283462524,
1269
+ "learning_rate": 0.0001227286150211635,
1270
+ "loss": 0.7365,
1271
+ "step": 16200
1272
+ },
1273
+ {
1274
+ "epoch": 1.1887112358936027,
1275
+ "grad_norm": 0.8597880601882935,
1276
+ "learning_rate": 0.0001222364405945467,
1277
+ "loss": 0.7336,
1278
+ "step": 16300
1279
+ },
1280
+ {
1281
+ "epoch": 1.1960037191664692,
1282
+ "grad_norm": 0.9574359059333801,
1283
+ "learning_rate": 0.0001217442661679299,
1284
+ "loss": 0.7394,
1285
+ "step": 16400
1286
+ },
1287
+ {
1288
+ "epoch": 1.2032962024393357,
1289
+ "grad_norm": 0.8484875559806824,
1290
+ "learning_rate": 0.00012125209174131314,
1291
+ "loss": 0.7392,
1292
+ "step": 16500
1293
+ },
1294
+ {
1295
+ "epoch": 1.2105886857122021,
1296
+ "grad_norm": 0.8847618699073792,
1297
+ "learning_rate": 0.00012075991731469633,
1298
+ "loss": 0.7427,
1299
+ "step": 16600
1300
+ },
1301
+ {
1302
+ "epoch": 1.2178811689850686,
1303
+ "grad_norm": 0.8780632019042969,
1304
+ "learning_rate": 0.00012026774288807954,
1305
+ "loss": 0.7399,
1306
+ "step": 16700
1307
+ },
1308
+ {
1309
+ "epoch": 1.225173652257935,
1310
+ "grad_norm": 0.8698965311050415,
1311
+ "learning_rate": 0.00011977556846146274,
1312
+ "loss": 0.7395,
1313
+ "step": 16800
1314
+ },
1315
+ {
1316
+ "epoch": 1.2324661355308015,
1317
+ "grad_norm": 0.8717935085296631,
1318
+ "learning_rate": 0.00011928339403484596,
1319
+ "loss": 0.7404,
1320
+ "step": 16900
1321
+ },
1322
+ {
1323
+ "epoch": 1.239758618803668,
1324
+ "grad_norm": 0.8375683426856995,
1325
+ "learning_rate": 0.00011879121960822917,
1326
+ "loss": 0.7405,
1327
+ "step": 17000
1328
+ },
1329
+ {
1330
+ "epoch": 1.239758618803668,
1331
+ "eval_loss": 0.7371787428855896,
1332
+ "eval_runtime": 60.9373,
1333
+ "eval_samples_per_second": 146.954,
1334
+ "eval_steps_per_second": 18.38,
1335
+ "step": 17000
1336
+ },
1337
+ {
1338
+ "epoch": 1.2470511020765347,
1339
+ "grad_norm": 0.8756095170974731,
1340
+ "learning_rate": 0.00011829904518161237,
1341
+ "loss": 0.736,
1342
+ "step": 17100
1343
+ },
1344
+ {
1345
+ "epoch": 1.2543435853494012,
1346
+ "grad_norm": 0.8513076901435852,
1347
+ "learning_rate": 0.00011780687075499556,
1348
+ "loss": 0.7399,
1349
+ "step": 17200
1350
+ },
1351
+ {
1352
+ "epoch": 1.2616360686222676,
1353
+ "grad_norm": 0.8297843337059021,
1354
+ "learning_rate": 0.0001173146963283788,
1355
+ "loss": 0.7406,
1356
+ "step": 17300
1357
+ },
1358
+ {
1359
+ "epoch": 1.268928551895134,
1360
+ "grad_norm": 0.8896269202232361,
1361
+ "learning_rate": 0.00011682252190176199,
1362
+ "loss": 0.7346,
1363
+ "step": 17400
1364
+ },
1365
+ {
1366
+ "epoch": 1.2762210351680006,
1367
+ "grad_norm": 0.874168336391449,
1368
+ "learning_rate": 0.0001163303474751452,
1369
+ "loss": 0.736,
1370
+ "step": 17500
1371
+ },
1372
+ {
1373
+ "epoch": 1.283513518440867,
1374
+ "grad_norm": 0.9101394414901733,
1375
+ "learning_rate": 0.0001158381730485284,
1376
+ "loss": 0.7376,
1377
+ "step": 17600
1378
+ },
1379
+ {
1380
+ "epoch": 1.2908060017137335,
1381
+ "grad_norm": 0.9011333584785461,
1382
+ "learning_rate": 0.00011534599862191162,
1383
+ "loss": 0.7361,
1384
+ "step": 17700
1385
+ },
1386
+ {
1387
+ "epoch": 1.2980984849866002,
1388
+ "grad_norm": 0.8839349746704102,
1389
+ "learning_rate": 0.00011485382419529482,
1390
+ "loss": 0.7373,
1391
+ "step": 17800
1392
+ },
1393
+ {
1394
+ "epoch": 1.3053909682594664,
1395
+ "grad_norm": 0.830528974533081,
1396
+ "learning_rate": 0.00011436164976867803,
1397
+ "loss": 0.7336,
1398
+ "step": 17900
1399
+ },
1400
+ {
1401
+ "epoch": 1.3126834515323331,
1402
+ "grad_norm": 0.8777081370353699,
1403
+ "learning_rate": 0.00011386947534206122,
1404
+ "loss": 0.7379,
1405
+ "step": 18000
1406
+ },
1407
+ {
1408
+ "epoch": 1.3126834515323331,
1409
+ "eval_loss": 0.7359282970428467,
1410
+ "eval_runtime": 60.8023,
1411
+ "eval_samples_per_second": 147.281,
1412
+ "eval_steps_per_second": 18.42,
1413
+ "step": 18000
1414
+ },
1415
+ {
1416
+ "epoch": 1.3199759348051996,
1417
+ "grad_norm": 0.8853510022163391,
1418
+ "learning_rate": 0.00011337730091544443,
1419
+ "loss": 0.7376,
1420
+ "step": 18100
1421
+ },
1422
+ {
1423
+ "epoch": 1.327268418078066,
1424
+ "grad_norm": 0.9219810366630554,
1425
+ "learning_rate": 0.00011288512648882766,
1426
+ "loss": 0.7399,
1427
+ "step": 18200
1428
+ },
1429
+ {
1430
+ "epoch": 1.3345609013509325,
1431
+ "grad_norm": 0.9233282208442688,
1432
+ "learning_rate": 0.00011239295206221085,
1433
+ "loss": 0.7399,
1434
+ "step": 18300
1435
+ },
1436
+ {
1437
+ "epoch": 1.341853384623799,
1438
+ "grad_norm": 0.8359719514846802,
1439
+ "learning_rate": 0.00011190077763559406,
1440
+ "loss": 0.7366,
1441
+ "step": 18400
1442
+ },
1443
+ {
1444
+ "epoch": 1.3491458678966655,
1445
+ "grad_norm": 0.8673479557037354,
1446
+ "learning_rate": 0.00011140860320897726,
1447
+ "loss": 0.7398,
1448
+ "step": 18500
1449
+ },
1450
+ {
1451
+ "epoch": 1.356438351169532,
1452
+ "grad_norm": 0.8565610647201538,
1453
+ "learning_rate": 0.00011091642878236048,
1454
+ "loss": 0.7278,
1455
+ "step": 18600
1456
+ },
1457
+ {
1458
+ "epoch": 1.3637308344423986,
1459
+ "grad_norm": 0.8547226190567017,
1460
+ "learning_rate": 0.00011042425435574369,
1461
+ "loss": 0.7381,
1462
+ "step": 18700
1463
+ },
1464
+ {
1465
+ "epoch": 1.3710233177152649,
1466
+ "grad_norm": 0.897081732749939,
1467
+ "learning_rate": 0.00010993207992912688,
1468
+ "loss": 0.7339,
1469
+ "step": 18800
1470
+ },
1471
+ {
1472
+ "epoch": 1.3783158009881316,
1473
+ "grad_norm": 0.8852410912513733,
1474
+ "learning_rate": 0.00010943990550251008,
1475
+ "loss": 0.7342,
1476
+ "step": 18900
1477
+ },
1478
+ {
1479
+ "epoch": 1.385608284260998,
1480
+ "grad_norm": 0.9213690161705017,
1481
+ "learning_rate": 0.00010894773107589332,
1482
+ "loss": 0.7389,
1483
+ "step": 19000
1484
+ },
1485
+ {
1486
+ "epoch": 1.385608284260998,
1487
+ "eval_loss": 0.7335625886917114,
1488
+ "eval_runtime": 60.8231,
1489
+ "eval_samples_per_second": 147.23,
1490
+ "eval_steps_per_second": 18.414,
1491
+ "step": 19000
1492
+ },
1493
+ {
1494
+ "epoch": 1.3929007675338645,
1495
+ "grad_norm": 0.8398423790931702,
1496
+ "learning_rate": 0.00010845555664927651,
1497
+ "loss": 0.7274,
1498
+ "step": 19100
1499
+ },
1500
+ {
1501
+ "epoch": 1.400193250806731,
1502
+ "grad_norm": 0.8863806128501892,
1503
+ "learning_rate": 0.00010796338222265971,
1504
+ "loss": 0.7331,
1505
+ "step": 19200
1506
+ },
1507
+ {
1508
+ "epoch": 1.4074857340795974,
1509
+ "grad_norm": 0.8836521506309509,
1510
+ "learning_rate": 0.00010747120779604292,
1511
+ "loss": 0.7334,
1512
+ "step": 19300
1513
+ },
1514
+ {
1515
+ "epoch": 1.414778217352464,
1516
+ "grad_norm": 0.8278964757919312,
1517
+ "learning_rate": 0.00010697903336942614,
1518
+ "loss": 0.7281,
1519
+ "step": 19400
1520
+ },
1521
+ {
1522
+ "epoch": 1.4220707006253304,
1523
+ "grad_norm": 0.8681420087814331,
1524
+ "learning_rate": 0.00010648685894280934,
1525
+ "loss": 0.7345,
1526
+ "step": 19500
1527
+ },
1528
+ {
1529
+ "epoch": 1.429363183898197,
1530
+ "grad_norm": 0.8721694946289062,
1531
+ "learning_rate": 0.00010599468451619255,
1532
+ "loss": 0.7246,
1533
+ "step": 19600
1534
+ },
1535
+ {
1536
+ "epoch": 1.4366556671710633,
1537
+ "grad_norm": 0.8880037665367126,
1538
+ "learning_rate": 0.00010550251008957574,
1539
+ "loss": 0.7321,
1540
+ "step": 19700
1541
+ },
1542
+ {
1543
+ "epoch": 1.44394815044393,
1544
+ "grad_norm": 0.8522552251815796,
1545
+ "learning_rate": 0.00010501033566295895,
1546
+ "loss": 0.734,
1547
+ "step": 19800
1548
+ },
1549
+ {
1550
+ "epoch": 1.4512406337167965,
1551
+ "grad_norm": 0.8816943168640137,
1552
+ "learning_rate": 0.00010451816123634217,
1553
+ "loss": 0.7333,
1554
+ "step": 19900
1555
+ },
1556
+ {
1557
+ "epoch": 1.458533116989663,
1558
+ "grad_norm": 0.8068501949310303,
1559
+ "learning_rate": 0.00010402598680972537,
1560
+ "loss": 0.7267,
1561
+ "step": 20000
1562
+ },
1563
+ {
1564
+ "epoch": 1.458533116989663,
1565
+ "eval_loss": 0.731645405292511,
1566
+ "eval_runtime": 61.0998,
1567
+ "eval_samples_per_second": 146.563,
1568
+ "eval_steps_per_second": 18.331,
1569
+ "step": 20000
1570
+ },
1571
+ {
1572
+ "epoch": 1.4658256002625294,
1573
+ "grad_norm": 0.8473337888717651,
1574
+ "learning_rate": 0.00010353381238310858,
1575
+ "loss": 0.7328,
1576
+ "step": 20100
1577
+ },
1578
+ {
1579
+ "epoch": 1.4731180835353959,
1580
+ "grad_norm": 0.9009122252464294,
1581
+ "learning_rate": 0.00010304163795649177,
1582
+ "loss": 0.733,
1583
+ "step": 20200
1584
+ },
1585
+ {
1586
+ "epoch": 1.4804105668082623,
1587
+ "grad_norm": 0.8225035667419434,
1588
+ "learning_rate": 0.000102549463529875,
1589
+ "loss": 0.7311,
1590
+ "step": 20300
1591
+ },
1592
+ {
1593
+ "epoch": 1.4877030500811288,
1594
+ "grad_norm": 0.8552617430686951,
1595
+ "learning_rate": 0.00010205728910325821,
1596
+ "loss": 0.7282,
1597
+ "step": 20400
1598
+ },
1599
+ {
1600
+ "epoch": 1.4949955333539955,
1601
+ "grad_norm": 0.8690235614776611,
1602
+ "learning_rate": 0.0001015651146766414,
1603
+ "loss": 0.7329,
1604
+ "step": 20500
1605
+ },
1606
+ {
1607
+ "epoch": 1.5022880166268617,
1608
+ "grad_norm": 0.8566781878471375,
1609
+ "learning_rate": 0.0001010729402500246,
1610
+ "loss": 0.7358,
1611
+ "step": 20600
1612
+ },
1613
+ {
1614
+ "epoch": 1.5095804998997284,
1615
+ "grad_norm": 0.9174933433532715,
1616
+ "learning_rate": 0.00010058076582340782,
1617
+ "loss": 0.7266,
1618
+ "step": 20700
1619
+ },
1620
+ {
1621
+ "epoch": 1.516872983172595,
1622
+ "grad_norm": 0.9414506554603577,
1623
+ "learning_rate": 0.00010008859139679103,
1624
+ "loss": 0.7321,
1625
+ "step": 20800
1626
+ },
1627
+ {
1628
+ "epoch": 1.5241654664454614,
1629
+ "grad_norm": 0.9433586001396179,
1630
+ "learning_rate": 9.959641697017424e-05,
1631
+ "loss": 0.7355,
1632
+ "step": 20900
1633
+ },
1634
+ {
1635
+ "epoch": 1.5314579497183278,
1636
+ "grad_norm": 0.8544315695762634,
1637
+ "learning_rate": 9.910424254355744e-05,
1638
+ "loss": 0.7313,
1639
+ "step": 21000
1640
+ },
1641
+ {
1642
+ "epoch": 1.5314579497183278,
1643
+ "eval_loss": 0.7285299301147461,
1644
+ "eval_runtime": 60.6886,
1645
+ "eval_samples_per_second": 147.557,
1646
+ "eval_steps_per_second": 18.455,
1647
+ "step": 21000
1648
+ },
1649
+ {
1650
+ "epoch": 1.5387504329911943,
1651
+ "grad_norm": 0.893223762512207,
1652
+ "learning_rate": 9.861206811694065e-05,
1653
+ "loss": 0.7329,
1654
+ "step": 21100
1655
+ },
1656
+ {
1657
+ "epoch": 1.546042916264061,
1658
+ "grad_norm": 0.8868634104728699,
1659
+ "learning_rate": 9.811989369032387e-05,
1660
+ "loss": 0.7276,
1661
+ "step": 21200
1662
+ },
1663
+ {
1664
+ "epoch": 1.5533353995369272,
1665
+ "grad_norm": 0.8362566232681274,
1666
+ "learning_rate": 9.762771926370706e-05,
1667
+ "loss": 0.723,
1668
+ "step": 21300
1669
+ },
1670
+ {
1671
+ "epoch": 1.560627882809794,
1672
+ "grad_norm": 0.8852083086967468,
1673
+ "learning_rate": 9.713554483709026e-05,
1674
+ "loss": 0.7281,
1675
+ "step": 21400
1676
+ },
1677
+ {
1678
+ "epoch": 1.5679203660826602,
1679
+ "grad_norm": 0.8901813626289368,
1680
+ "learning_rate": 9.664337041047348e-05,
1681
+ "loss": 0.7307,
1682
+ "step": 21500
1683
+ },
1684
+ {
1685
+ "epoch": 1.5752128493555269,
1686
+ "grad_norm": 0.8210172057151794,
1687
+ "learning_rate": 9.615119598385667e-05,
1688
+ "loss": 0.7245,
1689
+ "step": 21600
1690
+ },
1691
+ {
1692
+ "epoch": 1.5825053326283933,
1693
+ "grad_norm": 0.8676414489746094,
1694
+ "learning_rate": 9.56590215572399e-05,
1695
+ "loss": 0.7294,
1696
+ "step": 21700
1697
+ },
1698
+ {
1699
+ "epoch": 1.5897978159012598,
1700
+ "grad_norm": 0.8923740983009338,
1701
+ "learning_rate": 9.51668471306231e-05,
1702
+ "loss": 0.7242,
1703
+ "step": 21800
1704
+ },
1705
+ {
1706
+ "epoch": 1.5970902991741263,
1707
+ "grad_norm": 0.8402920365333557,
1708
+ "learning_rate": 9.46746727040063e-05,
1709
+ "loss": 0.7258,
1710
+ "step": 21900
1711
+ },
1712
+ {
1713
+ "epoch": 1.6043827824469927,
1714
+ "grad_norm": 0.8525983691215515,
1715
+ "learning_rate": 9.418249827738951e-05,
1716
+ "loss": 0.7294,
1717
+ "step": 22000
1718
+ },
1719
+ {
1720
+ "epoch": 1.6043827824469927,
1721
+ "eval_loss": 0.7267495393753052,
1722
+ "eval_runtime": 61.1086,
1723
+ "eval_samples_per_second": 146.542,
1724
+ "eval_steps_per_second": 18.328,
1725
+ "step": 22000
1726
+ },
1727
+ {
1728
+ "epoch": 1.6116752657198594,
1729
+ "grad_norm": 0.8605002164840698,
1730
+ "learning_rate": 9.369032385077272e-05,
1731
+ "loss": 0.7259,
1732
+ "step": 22100
1733
+ },
1734
+ {
1735
+ "epoch": 1.6189677489927257,
1736
+ "grad_norm": 0.8606895208358765,
1737
+ "learning_rate": 9.319814942415592e-05,
1738
+ "loss": 0.7275,
1739
+ "step": 22200
1740
+ },
1741
+ {
1742
+ "epoch": 1.6262602322655924,
1743
+ "grad_norm": 0.8824227452278137,
1744
+ "learning_rate": 9.270597499753914e-05,
1745
+ "loss": 0.7245,
1746
+ "step": 22300
1747
+ },
1748
+ {
1749
+ "epoch": 1.6335527155384586,
1750
+ "grad_norm": 0.8670118451118469,
1751
+ "learning_rate": 9.221380057092233e-05,
1752
+ "loss": 0.719,
1753
+ "step": 22400
1754
+ },
1755
+ {
1756
+ "epoch": 1.6408451988113253,
1757
+ "grad_norm": 0.92063307762146,
1758
+ "learning_rate": 9.172162614430555e-05,
1759
+ "loss": 0.7293,
1760
+ "step": 22500
1761
+ },
1762
+ {
1763
+ "epoch": 1.6481376820841918,
1764
+ "grad_norm": 0.8425260782241821,
1765
+ "learning_rate": 9.122945171768876e-05,
1766
+ "loss": 0.728,
1767
+ "step": 22600
1768
+ },
1769
+ {
1770
+ "epoch": 1.6554301653570582,
1771
+ "grad_norm": 0.9162302017211914,
1772
+ "learning_rate": 9.073727729107196e-05,
1773
+ "loss": 0.7265,
1774
+ "step": 22700
1775
+ },
1776
+ {
1777
+ "epoch": 1.6627226486299247,
1778
+ "grad_norm": 0.8905067443847656,
1779
+ "learning_rate": 9.024510286445517e-05,
1780
+ "loss": 0.7256,
1781
+ "step": 22800
1782
+ },
1783
+ {
1784
+ "epoch": 1.6700151319027912,
1785
+ "grad_norm": 0.874357283115387,
1786
+ "learning_rate": 8.975292843783837e-05,
1787
+ "loss": 0.7249,
1788
+ "step": 22900
1789
+ },
1790
+ {
1791
+ "epoch": 1.6773076151756579,
1792
+ "grad_norm": 0.842005729675293,
1793
+ "learning_rate": 8.926075401122158e-05,
1794
+ "loss": 0.7268,
1795
+ "step": 23000
1796
+ },
1797
+ {
1798
+ "epoch": 1.6773076151756579,
1799
+ "eval_loss": 0.7241798639297485,
1800
+ "eval_runtime": 60.7958,
1801
+ "eval_samples_per_second": 147.296,
1802
+ "eval_steps_per_second": 18.422,
1803
+ "step": 23000
1804
+ },
1805
+ {
1806
+ "epoch": 1.684600098448524,
1807
+ "grad_norm": 0.8695193529129028,
1808
+ "learning_rate": 8.876857958460478e-05,
1809
+ "loss": 0.7262,
1810
+ "step": 23100
1811
+ },
1812
+ {
1813
+ "epoch": 1.6918925817213908,
1814
+ "grad_norm": 0.8673058748245239,
1815
+ "learning_rate": 8.827640515798799e-05,
1816
+ "loss": 0.7303,
1817
+ "step": 23200
1818
+ },
1819
+ {
1820
+ "epoch": 1.699185064994257,
1821
+ "grad_norm": 0.9276596307754517,
1822
+ "learning_rate": 8.77842307313712e-05,
1823
+ "loss": 0.729,
1824
+ "step": 23300
1825
+ },
1826
+ {
1827
+ "epoch": 1.7064775482671237,
1828
+ "grad_norm": 0.8023722171783447,
1829
+ "learning_rate": 8.729205630475441e-05,
1830
+ "loss": 0.7212,
1831
+ "step": 23400
1832
+ },
1833
+ {
1834
+ "epoch": 1.7137700315399902,
1835
+ "grad_norm": 0.910897433757782,
1836
+ "learning_rate": 8.67998818781376e-05,
1837
+ "loss": 0.7252,
1838
+ "step": 23500
1839
+ },
1840
+ {
1841
+ "epoch": 1.7210625148128567,
1842
+ "grad_norm": 0.8714926838874817,
1843
+ "learning_rate": 8.630770745152083e-05,
1844
+ "loss": 0.7306,
1845
+ "step": 23600
1846
+ },
1847
+ {
1848
+ "epoch": 1.7283549980857231,
1849
+ "grad_norm": 0.8875166773796082,
1850
+ "learning_rate": 8.581553302490403e-05,
1851
+ "loss": 0.7235,
1852
+ "step": 23700
1853
+ },
1854
+ {
1855
+ "epoch": 1.7356474813585896,
1856
+ "grad_norm": 0.9132345914840698,
1857
+ "learning_rate": 8.532335859828724e-05,
1858
+ "loss": 0.7331,
1859
+ "step": 23800
1860
+ },
1861
+ {
1862
+ "epoch": 1.7429399646314563,
1863
+ "grad_norm": 0.8562710285186768,
1864
+ "learning_rate": 8.483118417167044e-05,
1865
+ "loss": 0.7282,
1866
+ "step": 23900
1867
+ },
1868
+ {
1869
+ "epoch": 1.7502324479043225,
1870
+ "grad_norm": 0.867508590221405,
1871
+ "learning_rate": 8.433900974505365e-05,
1872
+ "loss": 0.7256,
1873
+ "step": 24000
1874
+ },
1875
+ {
1876
+ "epoch": 1.7502324479043225,
1877
+ "eval_loss": 0.7232645153999329,
1878
+ "eval_runtime": 60.377,
1879
+ "eval_samples_per_second": 148.318,
1880
+ "eval_steps_per_second": 18.55,
1881
+ "step": 24000
1882
+ },
1883
+ {
1884
+ "epoch": 1.7575249311771892,
1885
+ "grad_norm": 0.8258200287818909,
1886
+ "learning_rate": 8.384683531843685e-05,
1887
+ "loss": 0.7254,
1888
+ "step": 24100
1889
+ },
1890
+ {
1891
+ "epoch": 1.7648174144500555,
1892
+ "grad_norm": 0.9109018445014954,
1893
+ "learning_rate": 8.335466089182007e-05,
1894
+ "loss": 0.7315,
1895
+ "step": 24200
1896
+ },
1897
+ {
1898
+ "epoch": 1.7721098977229222,
1899
+ "grad_norm": 0.8500842452049255,
1900
+ "learning_rate": 8.286248646520326e-05,
1901
+ "loss": 0.7265,
1902
+ "step": 24300
1903
+ },
1904
+ {
1905
+ "epoch": 1.7794023809957886,
1906
+ "grad_norm": 0.9286713600158691,
1907
+ "learning_rate": 8.237031203858648e-05,
1908
+ "loss": 0.7247,
1909
+ "step": 24400
1910
+ },
1911
+ {
1912
+ "epoch": 1.786694864268655,
1913
+ "grad_norm": 0.8746926188468933,
1914
+ "learning_rate": 8.187813761196969e-05,
1915
+ "loss": 0.7261,
1916
+ "step": 24500
1917
+ },
1918
+ {
1919
+ "epoch": 1.7939873475415216,
1920
+ "grad_norm": 0.8702288866043091,
1921
+ "learning_rate": 8.13859631853529e-05,
1922
+ "loss": 0.7207,
1923
+ "step": 24600
1924
+ },
1925
+ {
1926
+ "epoch": 1.801279830814388,
1927
+ "grad_norm": 0.9746344089508057,
1928
+ "learning_rate": 8.08937887587361e-05,
1929
+ "loss": 0.728,
1930
+ "step": 24700
1931
+ },
1932
+ {
1933
+ "epoch": 1.8085723140872547,
1934
+ "grad_norm": 0.8815904259681702,
1935
+ "learning_rate": 8.04016143321193e-05,
1936
+ "loss": 0.7174,
1937
+ "step": 24800
1938
+ },
1939
+ {
1940
+ "epoch": 1.815864797360121,
1941
+ "grad_norm": 0.870474100112915,
1942
+ "learning_rate": 7.990943990550251e-05,
1943
+ "loss": 0.7316,
1944
+ "step": 24900
1945
+ },
1946
+ {
1947
+ "epoch": 1.8231572806329877,
1948
+ "grad_norm": 0.8451401591300964,
1949
+ "learning_rate": 7.941726547888572e-05,
1950
+ "loss": 0.7202,
1951
+ "step": 25000
1952
+ },
1953
+ {
1954
+ "epoch": 1.8231572806329877,
1955
+ "eval_loss": 0.721147358417511,
1956
+ "eval_runtime": 60.8906,
1957
+ "eval_samples_per_second": 147.067,
1958
+ "eval_steps_per_second": 18.394,
1959
+ "step": 25000
1960
+ },
1961
+ {
1962
+ "epoch": 1.830449763905854,
1963
+ "grad_norm": 0.8878180980682373,
1964
+ "learning_rate": 7.892509105226894e-05,
1965
+ "loss": 0.7236,
1966
+ "step": 25100
1967
+ },
1968
+ {
1969
+ "epoch": 1.8377422471787206,
1970
+ "grad_norm": 0.859920859336853,
1971
+ "learning_rate": 7.843291662565213e-05,
1972
+ "loss": 0.7257,
1973
+ "step": 25200
1974
+ },
1975
+ {
1976
+ "epoch": 1.845034730451587,
1977
+ "grad_norm": 0.9358228445053101,
1978
+ "learning_rate": 7.794074219903535e-05,
1979
+ "loss": 0.7175,
1980
+ "step": 25300
1981
+ },
1982
+ {
1983
+ "epoch": 1.8523272137244535,
1984
+ "grad_norm": 0.858906626701355,
1985
+ "learning_rate": 7.744856777241854e-05,
1986
+ "loss": 0.7217,
1987
+ "step": 25400
1988
+ },
1989
+ {
1990
+ "epoch": 1.85961969699732,
1991
+ "grad_norm": 0.9508287310600281,
1992
+ "learning_rate": 7.695639334580176e-05,
1993
+ "loss": 0.7211,
1994
+ "step": 25500
1995
+ },
1996
+ {
1997
+ "epoch": 1.8669121802701865,
1998
+ "grad_norm": 0.9340062141418457,
1999
+ "learning_rate": 7.646421891918496e-05,
2000
+ "loss": 0.7254,
2001
+ "step": 25600
2002
+ },
2003
+ {
2004
+ "epoch": 1.8742046635430532,
2005
+ "grad_norm": 0.9350687861442566,
2006
+ "learning_rate": 7.597204449256817e-05,
2007
+ "loss": 0.7247,
2008
+ "step": 25700
2009
+ },
2010
+ {
2011
+ "epoch": 1.8814971468159194,
2012
+ "grad_norm": 0.9614841938018799,
2013
+ "learning_rate": 7.547987006595137e-05,
2014
+ "loss": 0.7283,
2015
+ "step": 25800
2016
+ },
2017
+ {
2018
+ "epoch": 1.888789630088786,
2019
+ "grad_norm": 0.848640501499176,
2020
+ "learning_rate": 7.49876956393346e-05,
2021
+ "loss": 0.7221,
2022
+ "step": 25900
2023
+ },
2024
+ {
2025
+ "epoch": 1.8960821133616523,
2026
+ "grad_norm": 0.8105534315109253,
2027
+ "learning_rate": 7.449552121271779e-05,
2028
+ "loss": 0.7205,
2029
+ "step": 26000
2030
+ },
2031
+ {
2032
+ "epoch": 1.8960821133616523,
2033
+ "eval_loss": 0.7193262577056885,
2034
+ "eval_runtime": 61.1614,
2035
+ "eval_samples_per_second": 146.416,
2036
+ "eval_steps_per_second": 18.312,
2037
+ "step": 26000
2038
+ },
2039
+ {
2040
+ "epoch": 1.903374596634519,
2041
+ "grad_norm": 0.8522207736968994,
2042
+ "learning_rate": 7.4003346786101e-05,
2043
+ "loss": 0.7223,
2044
+ "step": 26100
2045
+ },
2046
+ {
2047
+ "epoch": 1.9106670799073855,
2048
+ "grad_norm": 0.8983740210533142,
2049
+ "learning_rate": 7.351117235948421e-05,
2050
+ "loss": 0.7208,
2051
+ "step": 26200
2052
+ },
2053
+ {
2054
+ "epoch": 1.917959563180252,
2055
+ "grad_norm": 0.8596473336219788,
2056
+ "learning_rate": 7.301899793286742e-05,
2057
+ "loss": 0.7184,
2058
+ "step": 26300
2059
+ },
2060
+ {
2061
+ "epoch": 1.9252520464531184,
2062
+ "grad_norm": 0.9175098538398743,
2063
+ "learning_rate": 7.252682350625062e-05,
2064
+ "loss": 0.7213,
2065
+ "step": 26400
2066
+ },
2067
+ {
2068
+ "epoch": 1.932544529725985,
2069
+ "grad_norm": 0.8626872897148132,
2070
+ "learning_rate": 7.203464907963383e-05,
2071
+ "loss": 0.7242,
2072
+ "step": 26500
2073
+ },
2074
+ {
2075
+ "epoch": 1.9398370129988516,
2076
+ "grad_norm": 0.859780490398407,
2077
+ "learning_rate": 7.154247465301703e-05,
2078
+ "loss": 0.7197,
2079
+ "step": 26600
2080
+ },
2081
+ {
2082
+ "epoch": 1.9471294962717178,
2083
+ "grad_norm": 0.8713703751564026,
2084
+ "learning_rate": 7.105030022640024e-05,
2085
+ "loss": 0.7231,
2086
+ "step": 26700
2087
+ },
2088
+ {
2089
+ "epoch": 1.9544219795445845,
2090
+ "grad_norm": 0.8976535797119141,
2091
+ "learning_rate": 7.055812579978344e-05,
2092
+ "loss": 0.7233,
2093
+ "step": 26800
2094
+ },
2095
+ {
2096
+ "epoch": 1.9617144628174508,
2097
+ "grad_norm": 0.9257802367210388,
2098
+ "learning_rate": 7.006595137316665e-05,
2099
+ "loss": 0.7221,
2100
+ "step": 26900
2101
+ },
2102
+ {
2103
+ "epoch": 1.9690069460903175,
2104
+ "grad_norm": 0.8592785596847534,
2105
+ "learning_rate": 6.957377694654987e-05,
2106
+ "loss": 0.7168,
2107
+ "step": 27000
2108
+ },
2109
+ {
2110
+ "epoch": 1.9690069460903175,
2111
+ "eval_loss": 0.7180259227752686,
2112
+ "eval_runtime": 60.5352,
2113
+ "eval_samples_per_second": 147.931,
2114
+ "eval_steps_per_second": 18.502,
2115
+ "step": 27000
2116
+ },
2117
+ {
2118
+ "epoch": 1.976299429363184,
2119
+ "grad_norm": 0.8931472897529602,
2120
+ "learning_rate": 6.908160251993306e-05,
2121
+ "loss": 0.7204,
2122
+ "step": 27100
2123
+ },
2124
+ {
2125
+ "epoch": 1.9835919126360504,
2126
+ "grad_norm": 0.8821597695350647,
2127
+ "learning_rate": 6.858942809331628e-05,
2128
+ "loss": 0.7163,
2129
+ "step": 27200
2130
+ },
2131
+ {
2132
+ "epoch": 1.9908843959089169,
2133
+ "grad_norm": 0.8749621510505676,
2134
+ "learning_rate": 6.809725366669948e-05,
2135
+ "loss": 0.711,
2136
+ "step": 27300
2137
+ },
2138
+ {
2139
+ "epoch": 1.9981768791817833,
2140
+ "grad_norm": 0.903332531452179,
2141
+ "learning_rate": 6.760507924008269e-05,
2142
+ "loss": 0.7176,
2143
+ "step": 27400
2144
+ },
2145
+ {
2146
+ "epoch": 2.005505824871014,
2147
+ "grad_norm": 0.854773759841919,
2148
+ "learning_rate": 6.71129048134659e-05,
2149
+ "loss": 0.7187,
2150
+ "step": 27500
2151
+ },
2152
+ {
2153
+ "epoch": 2.0127983081438807,
2154
+ "grad_norm": 0.9489893913269043,
2155
+ "learning_rate": 6.66207303868491e-05,
2156
+ "loss": 0.7096,
2157
+ "step": 27600
2158
+ },
2159
+ {
2160
+ "epoch": 2.020090791416747,
2161
+ "grad_norm": 0.8944621682167053,
2162
+ "learning_rate": 6.61285559602323e-05,
2163
+ "loss": 0.7104,
2164
+ "step": 27700
2165
+ },
2166
+ {
2167
+ "epoch": 2.0273832746896137,
2168
+ "grad_norm": 0.8567011952400208,
2169
+ "learning_rate": 6.563638153361553e-05,
2170
+ "loss": 0.7124,
2171
+ "step": 27800
2172
+ },
2173
+ {
2174
+ "epoch": 2.0346757579624803,
2175
+ "grad_norm": 0.8737155199050903,
2176
+ "learning_rate": 6.514420710699872e-05,
2177
+ "loss": 0.7127,
2178
+ "step": 27900
2179
+ },
2180
+ {
2181
+ "epoch": 2.0419682412353466,
2182
+ "grad_norm": 0.8935887813568115,
2183
+ "learning_rate": 6.465203268038194e-05,
2184
+ "loss": 0.7122,
2185
+ "step": 28000
2186
+ },
2187
+ {
2188
+ "epoch": 2.0419682412353466,
2189
+ "eval_loss": 0.716705858707428,
2190
+ "eval_runtime": 60.7739,
2191
+ "eval_samples_per_second": 147.349,
2192
+ "eval_steps_per_second": 18.429,
2193
+ "step": 28000
2194
+ },
2195
+ {
2196
+ "epoch": 2.0492607245082133,
2197
+ "grad_norm": 0.9452987313270569,
2198
+ "learning_rate": 6.415985825376514e-05,
2199
+ "loss": 0.7112,
2200
+ "step": 28100
2201
+ },
2202
+ {
2203
+ "epoch": 2.0565532077810795,
2204
+ "grad_norm": 0.8650675415992737,
2205
+ "learning_rate": 6.366768382714833e-05,
2206
+ "loss": 0.7079,
2207
+ "step": 28200
2208
+ },
2209
+ {
2210
+ "epoch": 2.063845691053946,
2211
+ "grad_norm": 0.8913034796714783,
2212
+ "learning_rate": 6.317550940053155e-05,
2213
+ "loss": 0.713,
2214
+ "step": 28300
2215
+ },
2216
+ {
2217
+ "epoch": 2.0711381743268125,
2218
+ "grad_norm": 0.9072710275650024,
2219
+ "learning_rate": 6.268333497391476e-05,
2220
+ "loss": 0.7094,
2221
+ "step": 28400
2222
+ },
2223
+ {
2224
+ "epoch": 2.078430657599679,
2225
+ "grad_norm": 0.854245126247406,
2226
+ "learning_rate": 6.219116054729796e-05,
2227
+ "loss": 0.7077,
2228
+ "step": 28500
2229
+ },
2230
+ {
2231
+ "epoch": 2.0857231408725454,
2232
+ "grad_norm": 0.929263174533844,
2233
+ "learning_rate": 6.169898612068117e-05,
2234
+ "loss": 0.7086,
2235
+ "step": 28600
2236
+ },
2237
+ {
2238
+ "epoch": 2.093015624145412,
2239
+ "grad_norm": 0.9356215596199036,
2240
+ "learning_rate": 6.120681169406438e-05,
2241
+ "loss": 0.7157,
2242
+ "step": 28700
2243
+ },
2244
+ {
2245
+ "epoch": 2.100308107418279,
2246
+ "grad_norm": 0.9242870211601257,
2247
+ "learning_rate": 6.071463726744758e-05,
2248
+ "loss": 0.71,
2249
+ "step": 28800
2250
+ },
2251
+ {
2252
+ "epoch": 2.107600590691145,
2253
+ "grad_norm": 0.9065095782279968,
2254
+ "learning_rate": 6.022246284083079e-05,
2255
+ "loss": 0.7095,
2256
+ "step": 28900
2257
+ },
2258
+ {
2259
+ "epoch": 2.1148930739640117,
2260
+ "grad_norm": 0.9081276059150696,
2261
+ "learning_rate": 5.9730288414214e-05,
2262
+ "loss": 0.7096,
2263
+ "step": 29000
2264
+ },
2265
+ {
2266
+ "epoch": 2.1148930739640117,
2267
+ "eval_loss": 0.7152244448661804,
2268
+ "eval_runtime": 60.7986,
2269
+ "eval_samples_per_second": 147.29,
2270
+ "eval_steps_per_second": 18.421,
2271
+ "step": 29000
2272
+ },
2273
+ {
2274
+ "epoch": 2.122185557236878,
2275
+ "grad_norm": 0.8326215744018555,
2276
+ "learning_rate": 5.923811398759721e-05,
2277
+ "loss": 0.7147,
2278
+ "step": 29100
2279
+ },
2280
+ {
2281
+ "epoch": 2.1294780405097447,
2282
+ "grad_norm": 0.9274723529815674,
2283
+ "learning_rate": 5.874593956098041e-05,
2284
+ "loss": 0.7111,
2285
+ "step": 29200
2286
+ },
2287
+ {
2288
+ "epoch": 2.136770523782611,
2289
+ "grad_norm": 0.8282331824302673,
2290
+ "learning_rate": 5.825376513436362e-05,
2291
+ "loss": 0.7137,
2292
+ "step": 29300
2293
+ },
2294
+ {
2295
+ "epoch": 2.1440630070554776,
2296
+ "grad_norm": 0.9081612229347229,
2297
+ "learning_rate": 5.776159070774683e-05,
2298
+ "loss": 0.7115,
2299
+ "step": 29400
2300
+ },
2301
+ {
2302
+ "epoch": 2.151355490328344,
2303
+ "grad_norm": 0.9531508684158325,
2304
+ "learning_rate": 5.726941628113004e-05,
2305
+ "loss": 0.708,
2306
+ "step": 29500
2307
+ },
2308
+ {
2309
+ "epoch": 2.1586479736012105,
2310
+ "grad_norm": 0.9125275611877441,
2311
+ "learning_rate": 5.677724185451324e-05,
2312
+ "loss": 0.7123,
2313
+ "step": 29600
2314
+ },
2315
+ {
2316
+ "epoch": 2.165940456874077,
2317
+ "grad_norm": 0.9363859295845032,
2318
+ "learning_rate": 5.628506742789645e-05,
2319
+ "loss": 0.7146,
2320
+ "step": 29700
2321
+ },
2322
+ {
2323
+ "epoch": 2.1732329401469435,
2324
+ "grad_norm": 0.9164854884147644,
2325
+ "learning_rate": 5.579289300127966e-05,
2326
+ "loss": 0.7121,
2327
+ "step": 29800
2328
+ },
2329
+ {
2330
+ "epoch": 2.18052542341981,
2331
+ "grad_norm": 0.941330075263977,
2332
+ "learning_rate": 5.530071857466287e-05,
2333
+ "loss": 0.7086,
2334
+ "step": 29900
2335
+ },
2336
+ {
2337
+ "epoch": 2.1878179066926764,
2338
+ "grad_norm": 0.9006567597389221,
2339
+ "learning_rate": 5.480854414804607e-05,
2340
+ "loss": 0.7097,
2341
+ "step": 30000
2342
+ },
2343
+ {
2344
+ "epoch": 2.1878179066926764,
2345
+ "eval_loss": 0.7143043875694275,
2346
+ "eval_runtime": 61.0555,
2347
+ "eval_samples_per_second": 146.67,
2348
+ "eval_steps_per_second": 18.344,
2349
+ "step": 30000
2350
+ },
2351
+ {
2352
+ "epoch": 2.195110389965543,
2353
+ "grad_norm": 0.8913944363594055,
2354
+ "learning_rate": 5.431636972142927e-05,
2355
+ "loss": 0.7066,
2356
+ "step": 30100
2357
+ },
2358
+ {
2359
+ "epoch": 2.2024028732384093,
2360
+ "grad_norm": 0.9200546145439148,
2361
+ "learning_rate": 5.3824195294812486e-05,
2362
+ "loss": 0.7076,
2363
+ "step": 30200
2364
+ },
2365
+ {
2366
+ "epoch": 2.209695356511276,
2367
+ "grad_norm": 0.924148440361023,
2368
+ "learning_rate": 5.3332020868195684e-05,
2369
+ "loss": 0.7058,
2370
+ "step": 30300
2371
+ },
2372
+ {
2373
+ "epoch": 2.2169878397841423,
2374
+ "grad_norm": 0.922255277633667,
2375
+ "learning_rate": 5.2839846441578897e-05,
2376
+ "loss": 0.7108,
2377
+ "step": 30400
2378
+ },
2379
+ {
2380
+ "epoch": 2.224280323057009,
2381
+ "grad_norm": 0.9039818644523621,
2382
+ "learning_rate": 5.23476720149621e-05,
2383
+ "loss": 0.7091,
2384
+ "step": 30500
2385
+ },
2386
+ {
2387
+ "epoch": 2.2315728063298756,
2388
+ "grad_norm": 0.963845431804657,
2389
+ "learning_rate": 5.1855497588345314e-05,
2390
+ "loss": 0.7065,
2391
+ "step": 30600
2392
+ },
2393
+ {
2394
+ "epoch": 2.238865289602742,
2395
+ "grad_norm": 0.8838880658149719,
2396
+ "learning_rate": 5.136332316172851e-05,
2397
+ "loss": 0.7113,
2398
+ "step": 30700
2399
+ },
2400
+ {
2401
+ "epoch": 2.2461577728756086,
2402
+ "grad_norm": 0.9642555117607117,
2403
+ "learning_rate": 5.0871148735111725e-05,
2404
+ "loss": 0.7062,
2405
+ "step": 30800
2406
+ },
2407
+ {
2408
+ "epoch": 2.253450256148475,
2409
+ "grad_norm": 0.9088276624679565,
2410
+ "learning_rate": 5.037897430849493e-05,
2411
+ "loss": 0.7071,
2412
+ "step": 30900
2413
+ },
2414
+ {
2415
+ "epoch": 2.2607427394213415,
2416
+ "grad_norm": 0.9083282351493835,
2417
+ "learning_rate": 4.9886799881878137e-05,
2418
+ "loss": 0.7126,
2419
+ "step": 31000
2420
+ },
2421
+ {
2422
+ "epoch": 2.2607427394213415,
2423
+ "eval_loss": 0.7129958868026733,
2424
+ "eval_runtime": 60.7821,
2425
+ "eval_samples_per_second": 147.33,
2426
+ "eval_steps_per_second": 18.426,
2427
+ "step": 31000
2428
+ },
2429
+ {
2430
+ "epoch": 2.2680352226942078,
2431
+ "grad_norm": 0.886710524559021,
2432
+ "learning_rate": 4.939462545526134e-05,
2433
+ "loss": 0.7043,
2434
+ "step": 31100
2435
+ },
2436
+ {
2437
+ "epoch": 2.2753277059670745,
2438
+ "grad_norm": 0.8600069880485535,
2439
+ "learning_rate": 4.8902451028644554e-05,
2440
+ "loss": 0.7074,
2441
+ "step": 31200
2442
+ },
2443
+ {
2444
+ "epoch": 2.2826201892399407,
2445
+ "grad_norm": 0.8897703289985657,
2446
+ "learning_rate": 4.841027660202776e-05,
2447
+ "loss": 0.7068,
2448
+ "step": 31300
2449
+ },
2450
+ {
2451
+ "epoch": 2.2899126725128074,
2452
+ "grad_norm": 0.8638718724250793,
2453
+ "learning_rate": 4.7918102175410965e-05,
2454
+ "loss": 0.7062,
2455
+ "step": 31400
2456
+ },
2457
+ {
2458
+ "epoch": 2.297205155785674,
2459
+ "grad_norm": 0.8973529934883118,
2460
+ "learning_rate": 4.742592774879418e-05,
2461
+ "loss": 0.7073,
2462
+ "step": 31500
2463
+ },
2464
+ {
2465
+ "epoch": 2.3044976390585403,
2466
+ "grad_norm": 0.9759765267372131,
2467
+ "learning_rate": 4.693375332217738e-05,
2468
+ "loss": 0.7087,
2469
+ "step": 31600
2470
+ },
2471
+ {
2472
+ "epoch": 2.311790122331407,
2473
+ "grad_norm": 0.9061428904533386,
2474
+ "learning_rate": 4.644157889556059e-05,
2475
+ "loss": 0.708,
2476
+ "step": 31700
2477
+ },
2478
+ {
2479
+ "epoch": 2.3190826056042733,
2480
+ "grad_norm": 0.8808257579803467,
2481
+ "learning_rate": 4.5949404468943794e-05,
2482
+ "loss": 0.7086,
2483
+ "step": 31800
2484
+ },
2485
+ {
2486
+ "epoch": 2.32637508887714,
2487
+ "grad_norm": 0.9116071462631226,
2488
+ "learning_rate": 4.545723004232701e-05,
2489
+ "loss": 0.7118,
2490
+ "step": 31900
2491
+ },
2492
+ {
2493
+ "epoch": 2.333667572150006,
2494
+ "grad_norm": 0.9131873846054077,
2495
+ "learning_rate": 4.496505561571021e-05,
2496
+ "loss": 0.7043,
2497
+ "step": 32000
2498
+ },
2499
+ {
2500
+ "epoch": 2.333667572150006,
2501
+ "eval_loss": 0.7112506031990051,
2502
+ "eval_runtime": 61.1535,
2503
+ "eval_samples_per_second": 146.435,
2504
+ "eval_steps_per_second": 18.315,
2505
+ "step": 32000
2506
+ },
2507
+ {
2508
+ "epoch": 2.340960055422873,
2509
+ "grad_norm": 0.9860331416130066,
2510
+ "learning_rate": 4.447288118909342e-05,
2511
+ "loss": 0.7063,
2512
+ "step": 32100
2513
+ },
2514
+ {
2515
+ "epoch": 2.348252538695739,
2516
+ "grad_norm": 0.933958888053894,
2517
+ "learning_rate": 4.398070676247662e-05,
2518
+ "loss": 0.708,
2519
+ "step": 32200
2520
+ },
2521
+ {
2522
+ "epoch": 2.355545021968606,
2523
+ "grad_norm": 0.8994225859642029,
2524
+ "learning_rate": 4.3488532335859836e-05,
2525
+ "loss": 0.7089,
2526
+ "step": 32300
2527
+ },
2528
+ {
2529
+ "epoch": 2.3628375052414725,
2530
+ "grad_norm": 0.9435915946960449,
2531
+ "learning_rate": 4.299635790924304e-05,
2532
+ "loss": 0.7057,
2533
+ "step": 32400
2534
+ },
2535
+ {
2536
+ "epoch": 2.3701299885143388,
2537
+ "grad_norm": 0.888438880443573,
2538
+ "learning_rate": 4.2504183482626247e-05,
2539
+ "loss": 0.7012,
2540
+ "step": 32500
2541
+ },
2542
+ {
2543
+ "epoch": 2.3774224717872054,
2544
+ "grad_norm": 0.8772885799407959,
2545
+ "learning_rate": 4.201200905600945e-05,
2546
+ "loss": 0.7071,
2547
+ "step": 32600
2548
+ },
2549
+ {
2550
+ "epoch": 2.3847149550600717,
2551
+ "grad_norm": 0.9333481788635254,
2552
+ "learning_rate": 4.151983462939266e-05,
2553
+ "loss": 0.7095,
2554
+ "step": 32700
2555
+ },
2556
+ {
2557
+ "epoch": 2.3920074383329384,
2558
+ "grad_norm": 0.9497707486152649,
2559
+ "learning_rate": 4.102766020277586e-05,
2560
+ "loss": 0.7115,
2561
+ "step": 32800
2562
+ },
2563
+ {
2564
+ "epoch": 2.3992999216058046,
2565
+ "grad_norm": 0.9641472697257996,
2566
+ "learning_rate": 4.053548577615907e-05,
2567
+ "loss": 0.712,
2568
+ "step": 32900
2569
+ },
2570
+ {
2571
+ "epoch": 2.4065924048786713,
2572
+ "grad_norm": 0.8958153128623962,
2573
+ "learning_rate": 4.004331134954228e-05,
2574
+ "loss": 0.7035,
2575
+ "step": 33000
2576
+ },
2577
+ {
2578
+ "epoch": 2.4065924048786713,
2579
+ "eval_loss": 0.7100856304168701,
2580
+ "eval_runtime": 61.2325,
2581
+ "eval_samples_per_second": 146.246,
2582
+ "eval_steps_per_second": 18.291,
2583
+ "step": 33000
2584
+ },
2585
+ {
2586
+ "epoch": 2.4138848881515376,
2587
+ "grad_norm": 0.8818393349647522,
2588
+ "learning_rate": 3.9551136922925487e-05,
2589
+ "loss": 0.7052,
2590
+ "step": 33100
2591
+ },
2592
+ {
2593
+ "epoch": 2.4211773714244043,
2594
+ "grad_norm": 0.8973012566566467,
2595
+ "learning_rate": 3.905896249630869e-05,
2596
+ "loss": 0.706,
2597
+ "step": 33200
2598
+ },
2599
+ {
2600
+ "epoch": 2.428469854697271,
2601
+ "grad_norm": 0.8582873344421387,
2602
+ "learning_rate": 3.85667880696919e-05,
2603
+ "loss": 0.7088,
2604
+ "step": 33300
2605
+ },
2606
+ {
2607
+ "epoch": 2.435762337970137,
2608
+ "grad_norm": 0.9306252002716064,
2609
+ "learning_rate": 3.807461364307511e-05,
2610
+ "loss": 0.7062,
2611
+ "step": 33400
2612
+ },
2613
+ {
2614
+ "epoch": 2.443054821243004,
2615
+ "grad_norm": 0.8586992025375366,
2616
+ "learning_rate": 3.7582439216458315e-05,
2617
+ "loss": 0.7086,
2618
+ "step": 33500
2619
+ },
2620
+ {
2621
+ "epoch": 2.45034730451587,
2622
+ "grad_norm": 0.9076369404792786,
2623
+ "learning_rate": 3.709026478984152e-05,
2624
+ "loss": 0.7052,
2625
+ "step": 33600
2626
+ },
2627
+ {
2628
+ "epoch": 2.457639787788737,
2629
+ "grad_norm": 0.8954334855079651,
2630
+ "learning_rate": 3.6598090363224727e-05,
2631
+ "loss": 0.7082,
2632
+ "step": 33700
2633
+ },
2634
+ {
2635
+ "epoch": 2.464932271061603,
2636
+ "grad_norm": 0.9315345287322998,
2637
+ "learning_rate": 3.610591593660794e-05,
2638
+ "loss": 0.7058,
2639
+ "step": 33800
2640
+ },
2641
+ {
2642
+ "epoch": 2.4722247543344698,
2643
+ "grad_norm": 0.9223620295524597,
2644
+ "learning_rate": 3.5613741509991144e-05,
2645
+ "loss": 0.6992,
2646
+ "step": 33900
2647
+ },
2648
+ {
2649
+ "epoch": 2.479517237607336,
2650
+ "grad_norm": 0.9349290132522583,
2651
+ "learning_rate": 3.512156708337435e-05,
2652
+ "loss": 0.7084,
2653
+ "step": 34000
2654
+ },
2655
+ {
2656
+ "epoch": 2.479517237607336,
2657
+ "eval_loss": 0.7087690234184265,
2658
+ "eval_runtime": 60.8859,
2659
+ "eval_samples_per_second": 147.078,
2660
+ "eval_steps_per_second": 18.395,
2661
+ "step": 34000
2662
+ },
2663
+ {
2664
+ "epoch": 2.4868097208802027,
2665
+ "grad_norm": 0.883210301399231,
2666
+ "learning_rate": 3.462939265675756e-05,
2667
+ "loss": 0.7061,
2668
+ "step": 34100
2669
+ },
2670
+ {
2671
+ "epoch": 2.4941022041530694,
2672
+ "grad_norm": 0.920868456363678,
2673
+ "learning_rate": 3.413721823014077e-05,
2674
+ "loss": 0.7069,
2675
+ "step": 34200
2676
+ },
2677
+ {
2678
+ "epoch": 2.5013946874259356,
2679
+ "grad_norm": 0.9177393913269043,
2680
+ "learning_rate": 3.3645043803523966e-05,
2681
+ "loss": 0.7071,
2682
+ "step": 34300
2683
+ },
2684
+ {
2685
+ "epoch": 2.5086871706988023,
2686
+ "grad_norm": 0.9114101529121399,
2687
+ "learning_rate": 3.315286937690717e-05,
2688
+ "loss": 0.7072,
2689
+ "step": 34400
2690
+ },
2691
+ {
2692
+ "epoch": 2.5159796539716686,
2693
+ "grad_norm": 0.9645174145698547,
2694
+ "learning_rate": 3.2660694950290384e-05,
2695
+ "loss": 0.7028,
2696
+ "step": 34500
2697
+ },
2698
+ {
2699
+ "epoch": 2.5232721372445353,
2700
+ "grad_norm": 0.8982295989990234,
2701
+ "learning_rate": 3.216852052367359e-05,
2702
+ "loss": 0.7085,
2703
+ "step": 34600
2704
+ },
2705
+ {
2706
+ "epoch": 2.530564620517402,
2707
+ "grad_norm": 0.8964338898658752,
2708
+ "learning_rate": 3.1676346097056795e-05,
2709
+ "loss": 0.7069,
2710
+ "step": 34700
2711
+ },
2712
+ {
2713
+ "epoch": 2.537857103790268,
2714
+ "grad_norm": 0.9609666466712952,
2715
+ "learning_rate": 3.118417167044001e-05,
2716
+ "loss": 0.7057,
2717
+ "step": 34800
2718
+ },
2719
+ {
2720
+ "epoch": 2.5451495870631344,
2721
+ "grad_norm": 0.9131038188934326,
2722
+ "learning_rate": 3.069199724382321e-05,
2723
+ "loss": 0.7031,
2724
+ "step": 34900
2725
+ },
2726
+ {
2727
+ "epoch": 2.552442070336001,
2728
+ "grad_norm": 0.9127321839332581,
2729
+ "learning_rate": 3.019982281720642e-05,
2730
+ "loss": 0.6979,
2731
+ "step": 35000
2732
+ },
2733
+ {
2734
+ "epoch": 2.552442070336001,
2735
+ "eval_loss": 0.7076790928840637,
2736
+ "eval_runtime": 61.0966,
2737
+ "eval_samples_per_second": 146.571,
2738
+ "eval_steps_per_second": 18.332,
2739
+ "step": 35000
2740
+ },
2741
+ {
2742
+ "epoch": 2.559734553608868,
2743
+ "grad_norm": 0.9567495584487915,
2744
+ "learning_rate": 2.9707648390589628e-05,
2745
+ "loss": 0.7053,
2746
+ "step": 35100
2747
+ },
2748
+ {
2749
+ "epoch": 2.567027036881734,
2750
+ "grad_norm": 0.9740573763847351,
2751
+ "learning_rate": 2.9215473963972833e-05,
2752
+ "loss": 0.7077,
2753
+ "step": 35200
2754
+ },
2755
+ {
2756
+ "epoch": 2.5743195201546007,
2757
+ "grad_norm": 0.8982974886894226,
2758
+ "learning_rate": 2.8723299537356042e-05,
2759
+ "loss": 0.6983,
2760
+ "step": 35300
2761
+ },
2762
+ {
2763
+ "epoch": 2.581612003427467,
2764
+ "grad_norm": 1.0185188055038452,
2765
+ "learning_rate": 2.8231125110739248e-05,
2766
+ "loss": 0.7069,
2767
+ "step": 35400
2768
+ },
2769
+ {
2770
+ "epoch": 2.5889044867003337,
2771
+ "grad_norm": 0.94049471616745,
2772
+ "learning_rate": 2.7738950684122457e-05,
2773
+ "loss": 0.7054,
2774
+ "step": 35500
2775
+ },
2776
+ {
2777
+ "epoch": 2.5961969699732004,
2778
+ "grad_norm": 0.8923749923706055,
2779
+ "learning_rate": 2.7246776257505662e-05,
2780
+ "loss": 0.7015,
2781
+ "step": 35600
2782
+ },
2783
+ {
2784
+ "epoch": 2.6034894532460666,
2785
+ "grad_norm": 0.9568887948989868,
2786
+ "learning_rate": 2.675460183088887e-05,
2787
+ "loss": 0.7025,
2788
+ "step": 35700
2789
+ },
2790
+ {
2791
+ "epoch": 2.610781936518933,
2792
+ "grad_norm": 0.9106321334838867,
2793
+ "learning_rate": 2.6262427404272077e-05,
2794
+ "loss": 0.7049,
2795
+ "step": 35800
2796
+ },
2797
+ {
2798
+ "epoch": 2.6180744197917996,
2799
+ "grad_norm": 0.9499268531799316,
2800
+ "learning_rate": 2.5770252977655285e-05,
2801
+ "loss": 0.7021,
2802
+ "step": 35900
2803
+ },
2804
+ {
2805
+ "epoch": 2.6253669030646662,
2806
+ "grad_norm": 0.8965421915054321,
2807
+ "learning_rate": 2.5278078551038488e-05,
2808
+ "loss": 0.7036,
2809
+ "step": 36000
2810
+ },
2811
+ {
2812
+ "epoch": 2.6253669030646662,
2813
+ "eval_loss": 0.7065343856811523,
2814
+ "eval_runtime": 61.0446,
2815
+ "eval_samples_per_second": 146.696,
2816
+ "eval_steps_per_second": 18.347,
2817
+ "step": 36000
2818
+ },
2819
+ {
2820
+ "epoch": 2.6326593863375325,
2821
+ "grad_norm": 0.94576096534729,
2822
+ "learning_rate": 2.4785904124421696e-05,
2823
+ "loss": 0.71,
2824
+ "step": 36100
2825
+ },
2826
+ {
2827
+ "epoch": 2.639951869610399,
2828
+ "grad_norm": 0.962692141532898,
2829
+ "learning_rate": 2.4293729697804905e-05,
2830
+ "loss": 0.6953,
2831
+ "step": 36200
2832
+ },
2833
+ {
2834
+ "epoch": 2.6472443528832654,
2835
+ "grad_norm": 0.9457094669342041,
2836
+ "learning_rate": 2.380155527118811e-05,
2837
+ "loss": 0.7011,
2838
+ "step": 36300
2839
+ },
2840
+ {
2841
+ "epoch": 2.654536836156132,
2842
+ "grad_norm": 0.9523045420646667,
2843
+ "learning_rate": 2.330938084457132e-05,
2844
+ "loss": 0.7093,
2845
+ "step": 36400
2846
+ },
2847
+ {
2848
+ "epoch": 2.661829319428999,
2849
+ "grad_norm": 0.9255204796791077,
2850
+ "learning_rate": 2.2817206417954522e-05,
2851
+ "loss": 0.6979,
2852
+ "step": 36500
2853
+ },
2854
+ {
2855
+ "epoch": 2.669121802701865,
2856
+ "grad_norm": 1.015286922454834,
2857
+ "learning_rate": 2.232503199133773e-05,
2858
+ "loss": 0.7044,
2859
+ "step": 36600
2860
+ },
2861
+ {
2862
+ "epoch": 2.6764142859747313,
2863
+ "grad_norm": 0.8911315202713013,
2864
+ "learning_rate": 2.1832857564720936e-05,
2865
+ "loss": 0.7031,
2866
+ "step": 36700
2867
+ },
2868
+ {
2869
+ "epoch": 2.683706769247598,
2870
+ "grad_norm": 0.9372689127922058,
2871
+ "learning_rate": 2.1340683138104145e-05,
2872
+ "loss": 0.7019,
2873
+ "step": 36800
2874
+ },
2875
+ {
2876
+ "epoch": 2.6909992525204647,
2877
+ "grad_norm": 0.9245051145553589,
2878
+ "learning_rate": 2.084850871148735e-05,
2879
+ "loss": 0.7065,
2880
+ "step": 36900
2881
+ },
2882
+ {
2883
+ "epoch": 2.698291735793331,
2884
+ "grad_norm": 0.917607843875885,
2885
+ "learning_rate": 2.035633428487056e-05,
2886
+ "loss": 0.7016,
2887
+ "step": 37000
2888
+ },
2889
+ {
2890
+ "epoch": 2.698291735793331,
2891
+ "eval_loss": 0.7054994702339172,
2892
+ "eval_runtime": 60.6541,
2893
+ "eval_samples_per_second": 147.64,
2894
+ "eval_steps_per_second": 18.465,
2895
+ "step": 37000
2896
+ },
2897
+ {
2898
+ "epoch": 2.7055842190661976,
2899
+ "grad_norm": 0.9054610729217529,
2900
+ "learning_rate": 1.9864159858253765e-05,
2901
+ "loss": 0.7034,
2902
+ "step": 37100
2903
+ },
2904
+ {
2905
+ "epoch": 2.712876702339064,
2906
+ "grad_norm": 0.960075855255127,
2907
+ "learning_rate": 1.9371985431636974e-05,
2908
+ "loss": 0.7097,
2909
+ "step": 37200
2910
+ },
2911
+ {
2912
+ "epoch": 2.7201691856119306,
2913
+ "grad_norm": 0.9454420208930969,
2914
+ "learning_rate": 1.887981100502018e-05,
2915
+ "loss": 0.7046,
2916
+ "step": 37300
2917
+ },
2918
+ {
2919
+ "epoch": 2.7274616688847972,
2920
+ "grad_norm": 0.8761453628540039,
2921
+ "learning_rate": 1.8387636578403385e-05,
2922
+ "loss": 0.7068,
2923
+ "step": 37400
2924
+ },
2925
+ {
2926
+ "epoch": 2.7347541521576635,
2927
+ "grad_norm": 0.9231957793235779,
2928
+ "learning_rate": 1.7895462151786594e-05,
2929
+ "loss": 0.6983,
2930
+ "step": 37500
2931
+ },
2932
+ {
2933
+ "epoch": 2.7420466354305297,
2934
+ "grad_norm": 0.8630309104919434,
2935
+ "learning_rate": 1.74032877251698e-05,
2936
+ "loss": 0.6984,
2937
+ "step": 37600
2938
+ },
2939
+ {
2940
+ "epoch": 2.7493391187033964,
2941
+ "grad_norm": 0.9077728986740112,
2942
+ "learning_rate": 1.691111329855301e-05,
2943
+ "loss": 0.7097,
2944
+ "step": 37700
2945
+ },
2946
+ {
2947
+ "epoch": 2.756631601976263,
2948
+ "grad_norm": 0.9849316477775574,
2949
+ "learning_rate": 1.6418938871936214e-05,
2950
+ "loss": 0.7025,
2951
+ "step": 37800
2952
+ },
2953
+ {
2954
+ "epoch": 2.7639240852491294,
2955
+ "grad_norm": 0.9101927280426025,
2956
+ "learning_rate": 1.5926764445319423e-05,
2957
+ "loss": 0.7127,
2958
+ "step": 37900
2959
+ },
2960
+ {
2961
+ "epoch": 2.771216568521996,
2962
+ "grad_norm": 0.9624613523483276,
2963
+ "learning_rate": 1.543459001870263e-05,
2964
+ "loss": 0.7038,
2965
+ "step": 38000
2966
+ },
2967
+ {
2968
+ "epoch": 2.771216568521996,
2969
+ "eval_loss": 0.7042670845985413,
2970
+ "eval_runtime": 60.6288,
2971
+ "eval_samples_per_second": 147.702,
2972
+ "eval_steps_per_second": 18.473,
2973
+ "step": 38000
2974
+ },
2975
+ {
2976
+ "epoch": 2.7785090517948623,
2977
+ "grad_norm": 0.8926946520805359,
2978
+ "learning_rate": 1.4942415592085838e-05,
2979
+ "loss": 0.6955,
2980
+ "step": 38100
2981
+ },
2982
+ {
2983
+ "epoch": 2.785801535067729,
2984
+ "grad_norm": 0.9353916645050049,
2985
+ "learning_rate": 1.4450241165469041e-05,
2986
+ "loss": 0.7003,
2987
+ "step": 38200
2988
+ },
2989
+ {
2990
+ "epoch": 2.7930940183405957,
2991
+ "grad_norm": 0.9394625425338745,
2992
+ "learning_rate": 1.3958066738852249e-05,
2993
+ "loss": 0.6963,
2994
+ "step": 38300
2995
+ },
2996
+ {
2997
+ "epoch": 2.800386501613462,
2998
+ "grad_norm": 0.8811284303665161,
2999
+ "learning_rate": 1.3465892312235456e-05,
3000
+ "loss": 0.7057,
3001
+ "step": 38400
3002
+ },
3003
+ {
3004
+ "epoch": 2.807678984886328,
3005
+ "grad_norm": 0.9111167788505554,
3006
+ "learning_rate": 1.2973717885618663e-05,
3007
+ "loss": 0.6905,
3008
+ "step": 38500
3009
+ },
3010
+ {
3011
+ "epoch": 2.814971468159195,
3012
+ "grad_norm": 0.9061198830604553,
3013
+ "learning_rate": 1.248154345900187e-05,
3014
+ "loss": 0.6966,
3015
+ "step": 38600
3016
+ },
3017
+ {
3018
+ "epoch": 2.8222639514320615,
3019
+ "grad_norm": 0.917921781539917,
3020
+ "learning_rate": 1.1989369032385078e-05,
3021
+ "loss": 0.7055,
3022
+ "step": 38700
3023
+ },
3024
+ {
3025
+ "epoch": 2.829556434704928,
3026
+ "grad_norm": 0.9210913777351379,
3027
+ "learning_rate": 1.1497194605768285e-05,
3028
+ "loss": 0.7004,
3029
+ "step": 38800
3030
+ },
3031
+ {
3032
+ "epoch": 2.8368489179777945,
3033
+ "grad_norm": 0.9152899384498596,
3034
+ "learning_rate": 1.1005020179151492e-05,
3035
+ "loss": 0.7065,
3036
+ "step": 38900
3037
+ },
3038
+ {
3039
+ "epoch": 2.8441414012506607,
3040
+ "grad_norm": 0.9237668514251709,
3041
+ "learning_rate": 1.05128457525347e-05,
3042
+ "loss": 0.7027,
3043
+ "step": 39000
3044
+ },
3045
+ {
3046
+ "epoch": 2.8441414012506607,
3047
+ "eval_loss": 0.7034493088722229,
3048
+ "eval_runtime": 60.6775,
3049
+ "eval_samples_per_second": 147.583,
3050
+ "eval_steps_per_second": 18.458,
3051
+ "step": 39000
3052
+ },
3053
+ {
3054
+ "epoch": 2.8514338845235274,
3055
+ "grad_norm": 0.9577778577804565,
3056
+ "learning_rate": 1.0020671325917906e-05,
3057
+ "loss": 0.7064,
3058
+ "step": 39100
3059
+ },
3060
+ {
3061
+ "epoch": 2.858726367796394,
3062
+ "grad_norm": 0.9955913424491882,
3063
+ "learning_rate": 9.528496899301114e-06,
3064
+ "loss": 0.7017,
3065
+ "step": 39200
3066
+ },
3067
+ {
3068
+ "epoch": 2.8660188510692604,
3069
+ "grad_norm": 0.9187660217285156,
3070
+ "learning_rate": 9.03632247268432e-06,
3071
+ "loss": 0.6998,
3072
+ "step": 39300
3073
+ },
3074
+ {
3075
+ "epoch": 2.8733113343421266,
3076
+ "grad_norm": 0.9275550842285156,
3077
+ "learning_rate": 8.544148046067526e-06,
3078
+ "loss": 0.7002,
3079
+ "step": 39400
3080
+ },
3081
+ {
3082
+ "epoch": 2.8806038176149933,
3083
+ "grad_norm": 0.9114721417427063,
3084
+ "learning_rate": 8.051973619450734e-06,
3085
+ "loss": 0.7027,
3086
+ "step": 39500
3087
+ },
3088
+ {
3089
+ "epoch": 2.88789630088786,
3090
+ "grad_norm": 0.9408327341079712,
3091
+ "learning_rate": 7.559799192833941e-06,
3092
+ "loss": 0.7034,
3093
+ "step": 39600
3094
+ },
3095
+ {
3096
+ "epoch": 2.8951887841607262,
3097
+ "grad_norm": 0.9538366198539734,
3098
+ "learning_rate": 7.067624766217147e-06,
3099
+ "loss": 0.7007,
3100
+ "step": 39700
3101
+ },
3102
+ {
3103
+ "epoch": 2.902481267433593,
3104
+ "grad_norm": 0.923864483833313,
3105
+ "learning_rate": 6.5754503396003544e-06,
3106
+ "loss": 0.6972,
3107
+ "step": 39800
3108
+ },
3109
+ {
3110
+ "epoch": 2.909773750706459,
3111
+ "grad_norm": 0.9156636595726013,
3112
+ "learning_rate": 6.083275912983562e-06,
3113
+ "loss": 0.7064,
3114
+ "step": 39900
3115
+ },
3116
+ {
3117
+ "epoch": 2.917066233979326,
3118
+ "grad_norm": 0.9568312168121338,
3119
+ "learning_rate": 5.591101486366768e-06,
3120
+ "loss": 0.6969,
3121
+ "step": 40000
3122
+ },
3123
+ {
3124
+ "epoch": 2.917066233979326,
3125
+ "eval_loss": 0.7027888894081116,
3126
+ "eval_runtime": 61.1155,
3127
+ "eval_samples_per_second": 146.526,
3128
+ "eval_steps_per_second": 18.326,
3129
+ "step": 40000
3130
+ },
3131
+ {
3132
+ "epoch": 2.9243587172521925,
3133
+ "grad_norm": 0.9376012086868286,
3134
+ "learning_rate": 5.098927059749975e-06,
3135
+ "loss": 0.7,
3136
+ "step": 40100
3137
+ },
3138
+ {
3139
+ "epoch": 2.931651200525059,
3140
+ "grad_norm": 0.9648913145065308,
3141
+ "learning_rate": 4.6067526331331825e-06,
3142
+ "loss": 0.7042,
3143
+ "step": 40200
3144
+ },
3145
+ {
3146
+ "epoch": 2.938943683797925,
3147
+ "grad_norm": 0.9452090263366699,
3148
+ "learning_rate": 4.11457820651639e-06,
3149
+ "loss": 0.7041,
3150
+ "step": 40300
3151
+ },
3152
+ {
3153
+ "epoch": 2.9462361670707917,
3154
+ "grad_norm": 0.9553784728050232,
3155
+ "learning_rate": 3.622403779899597e-06,
3156
+ "loss": 0.7005,
3157
+ "step": 40400
3158
+ },
3159
+ {
3160
+ "epoch": 2.9535286503436584,
3161
+ "grad_norm": 0.8788447380065918,
3162
+ "learning_rate": 3.1302293532828033e-06,
3163
+ "loss": 0.6974,
3164
+ "step": 40500
3165
+ },
3166
+ {
3167
+ "epoch": 2.9608211336165247,
3168
+ "grad_norm": 0.9146846532821655,
3169
+ "learning_rate": 2.6380549266660105e-06,
3170
+ "loss": 0.7004,
3171
+ "step": 40600
3172
+ },
3173
+ {
3174
+ "epoch": 2.9681136168893913,
3175
+ "grad_norm": 0.9674293398857117,
3176
+ "learning_rate": 2.1458805000492173e-06,
3177
+ "loss": 0.7028,
3178
+ "step": 40700
3179
+ },
3180
+ {
3181
+ "epoch": 2.9754061001622576,
3182
+ "grad_norm": 0.9374125599861145,
3183
+ "learning_rate": 1.6537060734324243e-06,
3184
+ "loss": 0.7008,
3185
+ "step": 40800
3186
+ },
3187
+ {
3188
+ "epoch": 2.9826985834351243,
3189
+ "grad_norm": 0.9554013013839722,
3190
+ "learning_rate": 1.1615316468156316e-06,
3191
+ "loss": 0.7011,
3192
+ "step": 40900
3193
+ },
3194
+ {
3195
+ "epoch": 2.989991066707991,
3196
+ "grad_norm": 0.8910831212997437,
3197
+ "learning_rate": 6.693572201988385e-07,
3198
+ "loss": 0.6992,
3199
+ "step": 41000
3200
+ },
3201
+ {
3202
+ "epoch": 2.989991066707991,
3203
+ "eval_loss": 0.7023043632507324,
3204
+ "eval_runtime": 61.2519,
3205
+ "eval_samples_per_second": 146.2,
3206
+ "eval_steps_per_second": 18.285,
3207
+ "step": 41000
3208
+ }
3209
+ ],
3210
+ "logging_steps": 100,
3211
+ "max_steps": 41136,
3212
+ "num_input_tokens_seen": 0,
3213
+ "num_train_epochs": 3,
3214
+ "save_steps": 1000,
3215
+ "stateful_callbacks": {
3216
+ "TrainerControl": {
3217
+ "args": {
3218
+ "should_epoch_stop": false,
3219
+ "should_evaluate": false,
3220
+ "should_log": false,
3221
+ "should_save": true,
3222
+ "should_training_stop": false
3223
+ },
3224
+ "attributes": {}
3225
+ }
3226
+ },
3227
+ "total_flos": 7.256060461056e+17,
3228
+ "train_batch_size": 8,
3229
+ "trial_name": null,
3230
+ "trial_params": null
3231
+ }
checkpoint-41000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffa18fa243cccfbf729510f7d83fcb184f78dfbd7718a3073ec148d996a46094
3
+ size 5713
checkpoint-41136/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2-0.5B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
checkpoint-41136/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 32,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 16,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "q_proj",
25
+ "o_proj",
26
+ "v_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-41136/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79d856104b781021c43fad86f0478030885797265c8d4fffb66447b5b720f4a7
3
+ size 8676008
checkpoint-41136/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b917ab4eb3f1678171bb7eaa0d0d28bebed4e333ef3afbe6e446040d02f3ad16
3
+ size 17463051
checkpoint-41136/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cec9bafeaad5c1a3d2ac3267c62cccb4920f46c34a30d0fe9af9cbf6364bd451
3
+ size 14645
checkpoint-41136/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0feaf77daa922912d1b993492fc3c7917a22158db4f0d3267724782a332eb520
3
+ size 1465
checkpoint-41136/trainer_state.json ADDED
@@ -0,0 +1,3238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9999088439590893,
5
+ "eval_steps": 1000,
6
+ "global_step": 41136,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.007292483272866493,
13
+ "grad_norm": 2.1235318183898926,
14
+ "learning_rate": 4e-05,
15
+ "loss": 2.7429,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.014584966545732986,
20
+ "grad_norm": 1.9533482789993286,
21
+ "learning_rate": 8e-05,
22
+ "loss": 1.4786,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.02187744981859948,
27
+ "grad_norm": 1.5908012390136719,
28
+ "learning_rate": 0.00012,
29
+ "loss": 1.252,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.029169933091465972,
34
+ "grad_norm": 1.592781662940979,
35
+ "learning_rate": 0.00016,
36
+ "loss": 1.1674,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.036462416364332464,
41
+ "grad_norm": 1.4071415662765503,
42
+ "learning_rate": 0.0002,
43
+ "loss": 1.101,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.04375489963719896,
48
+ "grad_norm": 1.4228886365890503,
49
+ "learning_rate": 0.0001995078255733832,
50
+ "loss": 1.0487,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.05104738291006545,
55
+ "grad_norm": 1.2705847024917603,
56
+ "learning_rate": 0.00019901565114676642,
57
+ "loss": 1.0119,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.058339866182931945,
62
+ "grad_norm": 1.1770137548446655,
63
+ "learning_rate": 0.00019852347672014964,
64
+ "loss": 0.9906,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.06563234945579843,
69
+ "grad_norm": 1.1681164503097534,
70
+ "learning_rate": 0.00019803130229353283,
71
+ "loss": 0.9645,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.07292483272866493,
76
+ "grad_norm": 1.020504117012024,
77
+ "learning_rate": 0.00019753912786691605,
78
+ "loss": 0.9525,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.07292483272866493,
83
+ "eval_loss": 0.9407642483711243,
84
+ "eval_runtime": 61.0906,
85
+ "eval_samples_per_second": 146.586,
86
+ "eval_steps_per_second": 18.333,
87
+ "step": 1000
88
+ },
89
+ {
90
+ "epoch": 0.08021731600153142,
91
+ "grad_norm": 1.079444408416748,
92
+ "learning_rate": 0.00019704695344029924,
93
+ "loss": 0.9414,
94
+ "step": 1100
95
+ },
96
+ {
97
+ "epoch": 0.08750979927439792,
98
+ "grad_norm": 1.057377576828003,
99
+ "learning_rate": 0.00019655477901368246,
100
+ "loss": 0.9231,
101
+ "step": 1200
102
+ },
103
+ {
104
+ "epoch": 0.0948022825472644,
105
+ "grad_norm": 1.068018913269043,
106
+ "learning_rate": 0.00019606260458706568,
107
+ "loss": 0.9168,
108
+ "step": 1300
109
+ },
110
+ {
111
+ "epoch": 0.1020947658201309,
112
+ "grad_norm": 0.9460920095443726,
113
+ "learning_rate": 0.00019557043016044887,
114
+ "loss": 0.9031,
115
+ "step": 1400
116
+ },
117
+ {
118
+ "epoch": 0.1093872490929974,
119
+ "grad_norm": 1.056226134300232,
120
+ "learning_rate": 0.00019507825573383206,
121
+ "loss": 0.8901,
122
+ "step": 1500
123
+ },
124
+ {
125
+ "epoch": 0.11667973236586389,
126
+ "grad_norm": 1.0429835319519043,
127
+ "learning_rate": 0.00019458608130721528,
128
+ "loss": 0.8928,
129
+ "step": 1600
130
+ },
131
+ {
132
+ "epoch": 0.12397221563873038,
133
+ "grad_norm": 1.050790548324585,
134
+ "learning_rate": 0.0001940939068805985,
135
+ "loss": 0.8803,
136
+ "step": 1700
137
+ },
138
+ {
139
+ "epoch": 0.13126469891159687,
140
+ "grad_norm": 0.9586555361747742,
141
+ "learning_rate": 0.0001936017324539817,
142
+ "loss": 0.8809,
143
+ "step": 1800
144
+ },
145
+ {
146
+ "epoch": 0.13855718218446336,
147
+ "grad_norm": 0.985379159450531,
148
+ "learning_rate": 0.00019310955802736491,
149
+ "loss": 0.8743,
150
+ "step": 1900
151
+ },
152
+ {
153
+ "epoch": 0.14584966545732986,
154
+ "grad_norm": 0.9307010769844055,
155
+ "learning_rate": 0.00019261738360074813,
156
+ "loss": 0.8727,
157
+ "step": 2000
158
+ },
159
+ {
160
+ "epoch": 0.14584966545732986,
161
+ "eval_loss": 0.86456698179245,
162
+ "eval_runtime": 60.6283,
163
+ "eval_samples_per_second": 147.703,
164
+ "eval_steps_per_second": 18.473,
165
+ "step": 2000
166
+ },
167
+ {
168
+ "epoch": 0.15314214873019635,
169
+ "grad_norm": 1.0384063720703125,
170
+ "learning_rate": 0.00019212520917413133,
171
+ "loss": 0.8742,
172
+ "step": 2100
173
+ },
174
+ {
175
+ "epoch": 0.16043463200306285,
176
+ "grad_norm": 0.9662402868270874,
177
+ "learning_rate": 0.00019163303474751452,
178
+ "loss": 0.8661,
179
+ "step": 2200
180
+ },
181
+ {
182
+ "epoch": 0.16772711527592934,
183
+ "grad_norm": 0.9773098230361938,
184
+ "learning_rate": 0.00019114086032089774,
185
+ "loss": 0.8576,
186
+ "step": 2300
187
+ },
188
+ {
189
+ "epoch": 0.17501959854879584,
190
+ "grad_norm": 0.9672012329101562,
191
+ "learning_rate": 0.00019064868589428093,
192
+ "loss": 0.8595,
193
+ "step": 2400
194
+ },
195
+ {
196
+ "epoch": 0.1823120818216623,
197
+ "grad_norm": 0.9758124351501465,
198
+ "learning_rate": 0.00019015651146766415,
199
+ "loss": 0.8524,
200
+ "step": 2500
201
+ },
202
+ {
203
+ "epoch": 0.1896045650945288,
204
+ "grad_norm": 0.972232460975647,
205
+ "learning_rate": 0.00018966433704104737,
206
+ "loss": 0.8468,
207
+ "step": 2600
208
+ },
209
+ {
210
+ "epoch": 0.1968970483673953,
211
+ "grad_norm": 0.9417553544044495,
212
+ "learning_rate": 0.00018917216261443056,
213
+ "loss": 0.8412,
214
+ "step": 2700
215
+ },
216
+ {
217
+ "epoch": 0.2041895316402618,
218
+ "grad_norm": 0.9395071864128113,
219
+ "learning_rate": 0.00018867998818781375,
220
+ "loss": 0.8413,
221
+ "step": 2800
222
+ },
223
+ {
224
+ "epoch": 0.2114820149131283,
225
+ "grad_norm": 0.9951208233833313,
226
+ "learning_rate": 0.000188187813761197,
227
+ "loss": 0.8345,
228
+ "step": 2900
229
+ },
230
+ {
231
+ "epoch": 0.2187744981859948,
232
+ "grad_norm": 0.9656242728233337,
233
+ "learning_rate": 0.0001876956393345802,
234
+ "loss": 0.8317,
235
+ "step": 3000
236
+ },
237
+ {
238
+ "epoch": 0.2187744981859948,
239
+ "eval_loss": 0.8318613767623901,
240
+ "eval_runtime": 61.1356,
241
+ "eval_samples_per_second": 146.478,
242
+ "eval_steps_per_second": 18.32,
243
+ "step": 3000
244
+ },
245
+ {
246
+ "epoch": 0.22606698145886128,
247
+ "grad_norm": 0.8810185194015503,
248
+ "learning_rate": 0.00018720346490796338,
249
+ "loss": 0.8321,
250
+ "step": 3100
251
+ },
252
+ {
253
+ "epoch": 0.23335946473172778,
254
+ "grad_norm": 0.9199262857437134,
255
+ "learning_rate": 0.0001867112904813466,
256
+ "loss": 0.8406,
257
+ "step": 3200
258
+ },
259
+ {
260
+ "epoch": 0.24065194800459427,
261
+ "grad_norm": 0.9557051658630371,
262
+ "learning_rate": 0.00018621911605472982,
263
+ "loss": 0.8277,
264
+ "step": 3300
265
+ },
266
+ {
267
+ "epoch": 0.24794443127746077,
268
+ "grad_norm": 0.9777804017066956,
269
+ "learning_rate": 0.000185726941628113,
270
+ "loss": 0.8272,
271
+ "step": 3400
272
+ },
273
+ {
274
+ "epoch": 0.25523691455032727,
275
+ "grad_norm": 0.8856322169303894,
276
+ "learning_rate": 0.00018523476720149623,
277
+ "loss": 0.8256,
278
+ "step": 3500
279
+ },
280
+ {
281
+ "epoch": 0.26252939782319373,
282
+ "grad_norm": 0.9196017980575562,
283
+ "learning_rate": 0.00018474259277487942,
284
+ "loss": 0.8234,
285
+ "step": 3600
286
+ },
287
+ {
288
+ "epoch": 0.26982188109606026,
289
+ "grad_norm": 0.9568464159965515,
290
+ "learning_rate": 0.00018425041834826264,
291
+ "loss": 0.8193,
292
+ "step": 3700
293
+ },
294
+ {
295
+ "epoch": 0.2771143643689267,
296
+ "grad_norm": 0.9552770256996155,
297
+ "learning_rate": 0.00018375824392164583,
298
+ "loss": 0.8179,
299
+ "step": 3800
300
+ },
301
+ {
302
+ "epoch": 0.28440684764179325,
303
+ "grad_norm": 0.8997077345848083,
304
+ "learning_rate": 0.00018326606949502905,
305
+ "loss": 0.8138,
306
+ "step": 3900
307
+ },
308
+ {
309
+ "epoch": 0.2916993309146597,
310
+ "grad_norm": 0.8896480202674866,
311
+ "learning_rate": 0.00018277389506841224,
312
+ "loss": 0.8172,
313
+ "step": 4000
314
+ },
315
+ {
316
+ "epoch": 0.2916993309146597,
317
+ "eval_loss": 0.8123040199279785,
318
+ "eval_runtime": 60.7914,
319
+ "eval_samples_per_second": 147.307,
320
+ "eval_steps_per_second": 18.424,
321
+ "step": 4000
322
+ },
323
+ {
324
+ "epoch": 0.2989918141875262,
325
+ "grad_norm": 0.9520764350891113,
326
+ "learning_rate": 0.00018228172064179546,
327
+ "loss": 0.8183,
328
+ "step": 4100
329
+ },
330
+ {
331
+ "epoch": 0.3062842974603927,
332
+ "grad_norm": 0.9373065233230591,
333
+ "learning_rate": 0.00018178954621517868,
334
+ "loss": 0.8132,
335
+ "step": 4200
336
+ },
337
+ {
338
+ "epoch": 0.3135767807332592,
339
+ "grad_norm": 0.8733066916465759,
340
+ "learning_rate": 0.00018129737178856187,
341
+ "loss": 0.811,
342
+ "step": 4300
343
+ },
344
+ {
345
+ "epoch": 0.3208692640061257,
346
+ "grad_norm": 0.8866516351699829,
347
+ "learning_rate": 0.00018080519736194507,
348
+ "loss": 0.8093,
349
+ "step": 4400
350
+ },
351
+ {
352
+ "epoch": 0.32816174727899217,
353
+ "grad_norm": 0.9394953846931458,
354
+ "learning_rate": 0.00018031302293532828,
355
+ "loss": 0.8035,
356
+ "step": 4500
357
+ },
358
+ {
359
+ "epoch": 0.3354542305518587,
360
+ "grad_norm": 0.9133720993995667,
361
+ "learning_rate": 0.0001798208485087115,
362
+ "loss": 0.8054,
363
+ "step": 4600
364
+ },
365
+ {
366
+ "epoch": 0.34274671382472516,
367
+ "grad_norm": 0.9428606629371643,
368
+ "learning_rate": 0.0001793286740820947,
369
+ "loss": 0.8076,
370
+ "step": 4700
371
+ },
372
+ {
373
+ "epoch": 0.3500391970975917,
374
+ "grad_norm": 0.8996593356132507,
375
+ "learning_rate": 0.00017883649965547792,
376
+ "loss": 0.812,
377
+ "step": 4800
378
+ },
379
+ {
380
+ "epoch": 0.35733168037045815,
381
+ "grad_norm": 0.9113749265670776,
382
+ "learning_rate": 0.0001783443252288611,
383
+ "loss": 0.8048,
384
+ "step": 4900
385
+ },
386
+ {
387
+ "epoch": 0.3646241636433246,
388
+ "grad_norm": 0.9185646176338196,
389
+ "learning_rate": 0.00017785215080224433,
390
+ "loss": 0.8023,
391
+ "step": 5000
392
+ },
393
+ {
394
+ "epoch": 0.3646241636433246,
395
+ "eval_loss": 0.7973803877830505,
396
+ "eval_runtime": 60.8068,
397
+ "eval_samples_per_second": 147.27,
398
+ "eval_steps_per_second": 18.419,
399
+ "step": 5000
400
+ },
401
+ {
402
+ "epoch": 0.37191664691619114,
403
+ "grad_norm": 0.8994658589363098,
404
+ "learning_rate": 0.00017735997637562755,
405
+ "loss": 0.8089,
406
+ "step": 5100
407
+ },
408
+ {
409
+ "epoch": 0.3792091301890576,
410
+ "grad_norm": 0.8724523782730103,
411
+ "learning_rate": 0.00017686780194901074,
412
+ "loss": 0.8015,
413
+ "step": 5200
414
+ },
415
+ {
416
+ "epoch": 0.38650161346192413,
417
+ "grad_norm": 0.8285540342330933,
418
+ "learning_rate": 0.00017637562752239393,
419
+ "loss": 0.7944,
420
+ "step": 5300
421
+ },
422
+ {
423
+ "epoch": 0.3937940967347906,
424
+ "grad_norm": 0.8982509970664978,
425
+ "learning_rate": 0.00017588345309577718,
426
+ "loss": 0.7952,
427
+ "step": 5400
428
+ },
429
+ {
430
+ "epoch": 0.4010865800076571,
431
+ "grad_norm": 0.9266172051429749,
432
+ "learning_rate": 0.00017539127866916037,
433
+ "loss": 0.7978,
434
+ "step": 5500
435
+ },
436
+ {
437
+ "epoch": 0.4083790632805236,
438
+ "grad_norm": 0.901662290096283,
439
+ "learning_rate": 0.00017489910424254356,
440
+ "loss": 0.7966,
441
+ "step": 5600
442
+ },
443
+ {
444
+ "epoch": 0.4156715465533901,
445
+ "grad_norm": 0.9309051036834717,
446
+ "learning_rate": 0.00017440692981592678,
447
+ "loss": 0.7975,
448
+ "step": 5700
449
+ },
450
+ {
451
+ "epoch": 0.4229640298262566,
452
+ "grad_norm": 0.8789328336715698,
453
+ "learning_rate": 0.00017391475538930997,
454
+ "loss": 0.7997,
455
+ "step": 5800
456
+ },
457
+ {
458
+ "epoch": 0.4302565130991231,
459
+ "grad_norm": 0.8636139035224915,
460
+ "learning_rate": 0.0001734225809626932,
461
+ "loss": 0.7914,
462
+ "step": 5900
463
+ },
464
+ {
465
+ "epoch": 0.4375489963719896,
466
+ "grad_norm": 0.9468287229537964,
467
+ "learning_rate": 0.00017293040653607638,
468
+ "loss": 0.7859,
469
+ "step": 6000
470
+ },
471
+ {
472
+ "epoch": 0.4375489963719896,
473
+ "eval_loss": 0.7869976162910461,
474
+ "eval_runtime": 60.7741,
475
+ "eval_samples_per_second": 147.349,
476
+ "eval_steps_per_second": 18.429,
477
+ "step": 6000
478
+ },
479
+ {
480
+ "epoch": 0.44484147964485604,
481
+ "grad_norm": 0.867158055305481,
482
+ "learning_rate": 0.0001724382321094596,
483
+ "loss": 0.7924,
484
+ "step": 6100
485
+ },
486
+ {
487
+ "epoch": 0.45213396291772256,
488
+ "grad_norm": 0.9379836320877075,
489
+ "learning_rate": 0.0001719460576828428,
490
+ "loss": 0.7902,
491
+ "step": 6200
492
+ },
493
+ {
494
+ "epoch": 0.45942644619058903,
495
+ "grad_norm": 0.8591951727867126,
496
+ "learning_rate": 0.000171453883256226,
497
+ "loss": 0.7926,
498
+ "step": 6300
499
+ },
500
+ {
501
+ "epoch": 0.46671892946345556,
502
+ "grad_norm": 0.9702317118644714,
503
+ "learning_rate": 0.00017096170882960923,
504
+ "loss": 0.7867,
505
+ "step": 6400
506
+ },
507
+ {
508
+ "epoch": 0.474011412736322,
509
+ "grad_norm": 0.902302086353302,
510
+ "learning_rate": 0.00017046953440299242,
511
+ "loss": 0.7897,
512
+ "step": 6500
513
+ },
514
+ {
515
+ "epoch": 0.48130389600918855,
516
+ "grad_norm": 0.889926552772522,
517
+ "learning_rate": 0.00016997735997637561,
518
+ "loss": 0.7857,
519
+ "step": 6600
520
+ },
521
+ {
522
+ "epoch": 0.488596379282055,
523
+ "grad_norm": 0.8906420469284058,
524
+ "learning_rate": 0.00016948518554975886,
525
+ "loss": 0.7878,
526
+ "step": 6700
527
+ },
528
+ {
529
+ "epoch": 0.49588886255492154,
530
+ "grad_norm": 0.919983983039856,
531
+ "learning_rate": 0.00016899301112314205,
532
+ "loss": 0.7876,
533
+ "step": 6800
534
+ },
535
+ {
536
+ "epoch": 0.5031813458277881,
537
+ "grad_norm": 0.8610624670982361,
538
+ "learning_rate": 0.00016850083669652524,
539
+ "loss": 0.7923,
540
+ "step": 6900
541
+ },
542
+ {
543
+ "epoch": 0.5104738291006545,
544
+ "grad_norm": 0.9339637160301208,
545
+ "learning_rate": 0.00016800866226990846,
546
+ "loss": 0.7837,
547
+ "step": 7000
548
+ },
549
+ {
550
+ "epoch": 0.5104738291006545,
551
+ "eval_loss": 0.7791191935539246,
552
+ "eval_runtime": 60.8878,
553
+ "eval_samples_per_second": 147.074,
554
+ "eval_steps_per_second": 18.395,
555
+ "step": 7000
556
+ },
557
+ {
558
+ "epoch": 0.517766312373521,
559
+ "grad_norm": 0.9073446393013,
560
+ "learning_rate": 0.00016751648784329168,
561
+ "loss": 0.7809,
562
+ "step": 7100
563
+ },
564
+ {
565
+ "epoch": 0.5250587956463875,
566
+ "grad_norm": 0.9348235726356506,
567
+ "learning_rate": 0.00016702431341667487,
568
+ "loss": 0.7793,
569
+ "step": 7200
570
+ },
571
+ {
572
+ "epoch": 0.5323512789192539,
573
+ "grad_norm": 0.9155163168907166,
574
+ "learning_rate": 0.0001665321389900581,
575
+ "loss": 0.7821,
576
+ "step": 7300
577
+ },
578
+ {
579
+ "epoch": 0.5396437621921205,
580
+ "grad_norm": 0.9328250885009766,
581
+ "learning_rate": 0.00016603996456344129,
582
+ "loss": 0.7806,
583
+ "step": 7400
584
+ },
585
+ {
586
+ "epoch": 0.546936245464987,
587
+ "grad_norm": 0.8911275863647461,
588
+ "learning_rate": 0.00016554779013682448,
589
+ "loss": 0.7782,
590
+ "step": 7500
591
+ },
592
+ {
593
+ "epoch": 0.5542287287378534,
594
+ "grad_norm": 0.8989250659942627,
595
+ "learning_rate": 0.00016505561571020772,
596
+ "loss": 0.779,
597
+ "step": 7600
598
+ },
599
+ {
600
+ "epoch": 0.5615212120107199,
601
+ "grad_norm": 0.8869723081588745,
602
+ "learning_rate": 0.00016456344128359092,
603
+ "loss": 0.7822,
604
+ "step": 7700
605
+ },
606
+ {
607
+ "epoch": 0.5688136952835865,
608
+ "grad_norm": 0.8631371259689331,
609
+ "learning_rate": 0.0001640712668569741,
610
+ "loss": 0.7768,
611
+ "step": 7800
612
+ },
613
+ {
614
+ "epoch": 0.576106178556453,
615
+ "grad_norm": 0.8868420720100403,
616
+ "learning_rate": 0.00016357909243035733,
617
+ "loss": 0.7834,
618
+ "step": 7900
619
+ },
620
+ {
621
+ "epoch": 0.5833986618293194,
622
+ "grad_norm": 0.9253202080726624,
623
+ "learning_rate": 0.00016308691800374055,
624
+ "loss": 0.773,
625
+ "step": 8000
626
+ },
627
+ {
628
+ "epoch": 0.5833986618293194,
629
+ "eval_loss": 0.7733862400054932,
630
+ "eval_runtime": 60.8911,
631
+ "eval_samples_per_second": 147.066,
632
+ "eval_steps_per_second": 18.394,
633
+ "step": 8000
634
+ },
635
+ {
636
+ "epoch": 0.5906911451021859,
637
+ "grad_norm": 0.830760657787323,
638
+ "learning_rate": 0.00016259474357712374,
639
+ "loss": 0.7756,
640
+ "step": 8100
641
+ },
642
+ {
643
+ "epoch": 0.5979836283750524,
644
+ "grad_norm": 0.9371838569641113,
645
+ "learning_rate": 0.00016210256915050696,
646
+ "loss": 0.776,
647
+ "step": 8200
648
+ },
649
+ {
650
+ "epoch": 0.605276111647919,
651
+ "grad_norm": 0.8486947417259216,
652
+ "learning_rate": 0.00016161039472389015,
653
+ "loss": 0.7758,
654
+ "step": 8300
655
+ },
656
+ {
657
+ "epoch": 0.6125685949207854,
658
+ "grad_norm": 0.8888623118400574,
659
+ "learning_rate": 0.00016111822029727337,
660
+ "loss": 0.783,
661
+ "step": 8400
662
+ },
663
+ {
664
+ "epoch": 0.6198610781936519,
665
+ "grad_norm": 0.9176976084709167,
666
+ "learning_rate": 0.00016062604587065656,
667
+ "loss": 0.7782,
668
+ "step": 8500
669
+ },
670
+ {
671
+ "epoch": 0.6271535614665184,
672
+ "grad_norm": 0.90993732213974,
673
+ "learning_rate": 0.00016013387144403978,
674
+ "loss": 0.7741,
675
+ "step": 8600
676
+ },
677
+ {
678
+ "epoch": 0.6344460447393849,
679
+ "grad_norm": 0.8461544513702393,
680
+ "learning_rate": 0.00015964169701742297,
681
+ "loss": 0.7782,
682
+ "step": 8700
683
+ },
684
+ {
685
+ "epoch": 0.6417385280122514,
686
+ "grad_norm": 0.8642047643661499,
687
+ "learning_rate": 0.0001591495225908062,
688
+ "loss": 0.7706,
689
+ "step": 8800
690
+ },
691
+ {
692
+ "epoch": 0.6490310112851179,
693
+ "grad_norm": 0.8944571018218994,
694
+ "learning_rate": 0.0001586573481641894,
695
+ "loss": 0.7727,
696
+ "step": 8900
697
+ },
698
+ {
699
+ "epoch": 0.6563234945579843,
700
+ "grad_norm": 0.9075286984443665,
701
+ "learning_rate": 0.0001581651737375726,
702
+ "loss": 0.7748,
703
+ "step": 9000
704
+ },
705
+ {
706
+ "epoch": 0.6563234945579843,
707
+ "eval_loss": 0.7666329741477966,
708
+ "eval_runtime": 60.5924,
709
+ "eval_samples_per_second": 147.791,
710
+ "eval_steps_per_second": 18.484,
711
+ "step": 9000
712
+ },
713
+ {
714
+ "epoch": 0.6636159778308508,
715
+ "grad_norm": 0.9164955615997314,
716
+ "learning_rate": 0.0001576729993109558,
717
+ "loss": 0.7792,
718
+ "step": 9100
719
+ },
720
+ {
721
+ "epoch": 0.6709084611037174,
722
+ "grad_norm": 0.8446054458618164,
723
+ "learning_rate": 0.000157180824884339,
724
+ "loss": 0.7661,
725
+ "step": 9200
726
+ },
727
+ {
728
+ "epoch": 0.6782009443765838,
729
+ "grad_norm": 0.8793991804122925,
730
+ "learning_rate": 0.00015668865045772223,
731
+ "loss": 0.7678,
732
+ "step": 9300
733
+ },
734
+ {
735
+ "epoch": 0.6854934276494503,
736
+ "grad_norm": 0.8772592544555664,
737
+ "learning_rate": 0.00015619647603110542,
738
+ "loss": 0.7708,
739
+ "step": 9400
740
+ },
741
+ {
742
+ "epoch": 0.6927859109223168,
743
+ "grad_norm": 0.854118824005127,
744
+ "learning_rate": 0.00015570430160448864,
745
+ "loss": 0.7616,
746
+ "step": 9500
747
+ },
748
+ {
749
+ "epoch": 0.7000783941951834,
750
+ "grad_norm": 0.8653910756111145,
751
+ "learning_rate": 0.00015521212717787183,
752
+ "loss": 0.767,
753
+ "step": 9600
754
+ },
755
+ {
756
+ "epoch": 0.7073708774680498,
757
+ "grad_norm": 0.8890120387077332,
758
+ "learning_rate": 0.00015471995275125505,
759
+ "loss": 0.7657,
760
+ "step": 9700
761
+ },
762
+ {
763
+ "epoch": 0.7146633607409163,
764
+ "grad_norm": 0.8451828360557556,
765
+ "learning_rate": 0.00015422777832463827,
766
+ "loss": 0.7656,
767
+ "step": 9800
768
+ },
769
+ {
770
+ "epoch": 0.7219558440137828,
771
+ "grad_norm": 0.9029329419136047,
772
+ "learning_rate": 0.00015373560389802146,
773
+ "loss": 0.7749,
774
+ "step": 9900
775
+ },
776
+ {
777
+ "epoch": 0.7292483272866492,
778
+ "grad_norm": 0.8538834452629089,
779
+ "learning_rate": 0.00015324342947140466,
780
+ "loss": 0.763,
781
+ "step": 10000
782
+ },
783
+ {
784
+ "epoch": 0.7292483272866492,
785
+ "eval_loss": 0.76123046875,
786
+ "eval_runtime": 60.847,
787
+ "eval_samples_per_second": 147.172,
788
+ "eval_steps_per_second": 18.407,
789
+ "step": 10000
790
+ },
791
+ {
792
+ "epoch": 0.7365408105595158,
793
+ "grad_norm": 0.8594367504119873,
794
+ "learning_rate": 0.00015275125504478788,
795
+ "loss": 0.7693,
796
+ "step": 10100
797
+ },
798
+ {
799
+ "epoch": 0.7438332938323823,
800
+ "grad_norm": 0.8748040199279785,
801
+ "learning_rate": 0.0001522590806181711,
802
+ "loss": 0.7684,
803
+ "step": 10200
804
+ },
805
+ {
806
+ "epoch": 0.7511257771052487,
807
+ "grad_norm": 0.9177483320236206,
808
+ "learning_rate": 0.0001517669061915543,
809
+ "loss": 0.7599,
810
+ "step": 10300
811
+ },
812
+ {
813
+ "epoch": 0.7584182603781152,
814
+ "grad_norm": 0.8988757729530334,
815
+ "learning_rate": 0.0001512747317649375,
816
+ "loss": 0.7648,
817
+ "step": 10400
818
+ },
819
+ {
820
+ "epoch": 0.7657107436509818,
821
+ "grad_norm": 0.8735676407814026,
822
+ "learning_rate": 0.00015078255733832073,
823
+ "loss": 0.7656,
824
+ "step": 10500
825
+ },
826
+ {
827
+ "epoch": 0.7730032269238483,
828
+ "grad_norm": 0.8750614523887634,
829
+ "learning_rate": 0.00015029038291170392,
830
+ "loss": 0.7632,
831
+ "step": 10600
832
+ },
833
+ {
834
+ "epoch": 0.7802957101967147,
835
+ "grad_norm": 0.8786306381225586,
836
+ "learning_rate": 0.0001497982084850871,
837
+ "loss": 0.7659,
838
+ "step": 10700
839
+ },
840
+ {
841
+ "epoch": 0.7875881934695812,
842
+ "grad_norm": 0.811834990978241,
843
+ "learning_rate": 0.00014930603405847033,
844
+ "loss": 0.7652,
845
+ "step": 10800
846
+ },
847
+ {
848
+ "epoch": 0.7948806767424477,
849
+ "grad_norm": 0.8844282031059265,
850
+ "learning_rate": 0.00014881385963185352,
851
+ "loss": 0.7623,
852
+ "step": 10900
853
+ },
854
+ {
855
+ "epoch": 0.8021731600153142,
856
+ "grad_norm": 0.8444844484329224,
857
+ "learning_rate": 0.00014832168520523674,
858
+ "loss": 0.7622,
859
+ "step": 11000
860
+ },
861
+ {
862
+ "epoch": 0.8021731600153142,
863
+ "eval_loss": 0.75812828540802,
864
+ "eval_runtime": 60.7569,
865
+ "eval_samples_per_second": 147.391,
866
+ "eval_steps_per_second": 18.434,
867
+ "step": 11000
868
+ },
869
+ {
870
+ "epoch": 0.8094656432881807,
871
+ "grad_norm": 0.8396947979927063,
872
+ "learning_rate": 0.00014782951077861996,
873
+ "loss": 0.7673,
874
+ "step": 11100
875
+ },
876
+ {
877
+ "epoch": 0.8167581265610472,
878
+ "grad_norm": 0.8890758752822876,
879
+ "learning_rate": 0.00014733733635200315,
880
+ "loss": 0.7551,
881
+ "step": 11200
882
+ },
883
+ {
884
+ "epoch": 0.8240506098339136,
885
+ "grad_norm": 0.8038908839225769,
886
+ "learning_rate": 0.00014684516192538634,
887
+ "loss": 0.7612,
888
+ "step": 11300
889
+ },
890
+ {
891
+ "epoch": 0.8313430931067802,
892
+ "grad_norm": 0.8224745392799377,
893
+ "learning_rate": 0.0001463529874987696,
894
+ "loss": 0.7618,
895
+ "step": 11400
896
+ },
897
+ {
898
+ "epoch": 0.8386355763796467,
899
+ "grad_norm": 0.8691264390945435,
900
+ "learning_rate": 0.00014586081307215278,
901
+ "loss": 0.7618,
902
+ "step": 11500
903
+ },
904
+ {
905
+ "epoch": 0.8459280596525132,
906
+ "grad_norm": 0.8442777395248413,
907
+ "learning_rate": 0.00014536863864553597,
908
+ "loss": 0.7671,
909
+ "step": 11600
910
+ },
911
+ {
912
+ "epoch": 0.8532205429253796,
913
+ "grad_norm": 0.8520532846450806,
914
+ "learning_rate": 0.0001448764642189192,
915
+ "loss": 0.7625,
916
+ "step": 11700
917
+ },
918
+ {
919
+ "epoch": 0.8605130261982462,
920
+ "grad_norm": 0.908760666847229,
921
+ "learning_rate": 0.0001443842897923024,
922
+ "loss": 0.7615,
923
+ "step": 11800
924
+ },
925
+ {
926
+ "epoch": 0.8678055094711127,
927
+ "grad_norm": 0.8004080057144165,
928
+ "learning_rate": 0.0001438921153656856,
929
+ "loss": 0.7632,
930
+ "step": 11900
931
+ },
932
+ {
933
+ "epoch": 0.8750979927439791,
934
+ "grad_norm": 0.8449864983558655,
935
+ "learning_rate": 0.00014339994093906882,
936
+ "loss": 0.7574,
937
+ "step": 12000
938
+ },
939
+ {
940
+ "epoch": 0.8750979927439791,
941
+ "eval_loss": 0.752128005027771,
942
+ "eval_runtime": 61.1399,
943
+ "eval_samples_per_second": 146.467,
944
+ "eval_steps_per_second": 18.319,
945
+ "step": 12000
946
+ },
947
+ {
948
+ "epoch": 0.8823904760168456,
949
+ "grad_norm": 0.8218274116516113,
950
+ "learning_rate": 0.00014290776651245201,
951
+ "loss": 0.7555,
952
+ "step": 12100
953
+ },
954
+ {
955
+ "epoch": 0.8896829592897121,
956
+ "grad_norm": 0.8944920897483826,
957
+ "learning_rate": 0.00014241559208583523,
958
+ "loss": 0.7594,
959
+ "step": 12200
960
+ },
961
+ {
962
+ "epoch": 0.8969754425625787,
963
+ "grad_norm": 0.9254937767982483,
964
+ "learning_rate": 0.00014192341765921845,
965
+ "loss": 0.7598,
966
+ "step": 12300
967
+ },
968
+ {
969
+ "epoch": 0.9042679258354451,
970
+ "grad_norm": 0.8887091875076294,
971
+ "learning_rate": 0.00014143124323260164,
972
+ "loss": 0.7625,
973
+ "step": 12400
974
+ },
975
+ {
976
+ "epoch": 0.9115604091083116,
977
+ "grad_norm": 0.8478124737739563,
978
+ "learning_rate": 0.00014093906880598484,
979
+ "loss": 0.756,
980
+ "step": 12500
981
+ },
982
+ {
983
+ "epoch": 0.9188528923811781,
984
+ "grad_norm": 0.9377927780151367,
985
+ "learning_rate": 0.00014044689437936805,
986
+ "loss": 0.7606,
987
+ "step": 12600
988
+ },
989
+ {
990
+ "epoch": 0.9261453756540446,
991
+ "grad_norm": 0.838175892829895,
992
+ "learning_rate": 0.00013995471995275127,
993
+ "loss": 0.7605,
994
+ "step": 12700
995
+ },
996
+ {
997
+ "epoch": 0.9334378589269111,
998
+ "grad_norm": 0.8345216512680054,
999
+ "learning_rate": 0.00013946254552613447,
1000
+ "loss": 0.7568,
1001
+ "step": 12800
1002
+ },
1003
+ {
1004
+ "epoch": 0.9407303421997776,
1005
+ "grad_norm": 0.894477367401123,
1006
+ "learning_rate": 0.00013897037109951766,
1007
+ "loss": 0.7535,
1008
+ "step": 12900
1009
+ },
1010
+ {
1011
+ "epoch": 0.948022825472644,
1012
+ "grad_norm": 0.849010169506073,
1013
+ "learning_rate": 0.00013847819667290088,
1014
+ "loss": 0.7465,
1015
+ "step": 13000
1016
+ },
1017
+ {
1018
+ "epoch": 0.948022825472644,
1019
+ "eval_loss": 0.7492165565490723,
1020
+ "eval_runtime": 60.7079,
1021
+ "eval_samples_per_second": 147.51,
1022
+ "eval_steps_per_second": 18.449,
1023
+ "step": 13000
1024
+ },
1025
+ {
1026
+ "epoch": 0.9553153087455105,
1027
+ "grad_norm": 0.8754207491874695,
1028
+ "learning_rate": 0.0001379860222462841,
1029
+ "loss": 0.7576,
1030
+ "step": 13100
1031
+ },
1032
+ {
1033
+ "epoch": 0.9626077920183771,
1034
+ "grad_norm": 0.8984807133674622,
1035
+ "learning_rate": 0.0001374938478196673,
1036
+ "loss": 0.7493,
1037
+ "step": 13200
1038
+ },
1039
+ {
1040
+ "epoch": 0.9699002752912436,
1041
+ "grad_norm": 0.8458361029624939,
1042
+ "learning_rate": 0.0001370016733930505,
1043
+ "loss": 0.7468,
1044
+ "step": 13300
1045
+ },
1046
+ {
1047
+ "epoch": 0.97719275856411,
1048
+ "grad_norm": 0.9169609546661377,
1049
+ "learning_rate": 0.0001365094989664337,
1050
+ "loss": 0.7515,
1051
+ "step": 13400
1052
+ },
1053
+ {
1054
+ "epoch": 0.9844852418369765,
1055
+ "grad_norm": 0.8027638792991638,
1056
+ "learning_rate": 0.00013601732453981692,
1057
+ "loss": 0.7551,
1058
+ "step": 13500
1059
+ },
1060
+ {
1061
+ "epoch": 0.9917777251098431,
1062
+ "grad_norm": 0.8572927117347717,
1063
+ "learning_rate": 0.00013552515011320014,
1064
+ "loss": 0.7481,
1065
+ "step": 13600
1066
+ },
1067
+ {
1068
+ "epoch": 0.9990702083827095,
1069
+ "grad_norm": 0.8624053001403809,
1070
+ "learning_rate": 0.00013503297568658333,
1071
+ "loss": 0.7481,
1072
+ "step": 13700
1073
+ },
1074
+ {
1075
+ "epoch": 1.0063991540719404,
1076
+ "grad_norm": 0.8915347456932068,
1077
+ "learning_rate": 0.00013454080125996652,
1078
+ "loss": 0.7463,
1079
+ "step": 13800
1080
+ },
1081
+ {
1082
+ "epoch": 1.0136916373448068,
1083
+ "grad_norm": 0.8233557939529419,
1084
+ "learning_rate": 0.00013404862683334977,
1085
+ "loss": 0.7398,
1086
+ "step": 13900
1087
+ },
1088
+ {
1089
+ "epoch": 1.0209841206176733,
1090
+ "grad_norm": 0.8467598557472229,
1091
+ "learning_rate": 0.00013355645240673296,
1092
+ "loss": 0.7402,
1093
+ "step": 14000
1094
+ },
1095
+ {
1096
+ "epoch": 1.0209841206176733,
1097
+ "eval_loss": 0.7458442449569702,
1098
+ "eval_runtime": 60.6887,
1099
+ "eval_samples_per_second": 147.556,
1100
+ "eval_steps_per_second": 18.455,
1101
+ "step": 14000
1102
+ },
1103
+ {
1104
+ "epoch": 1.0282766038905398,
1105
+ "grad_norm": 0.852739691734314,
1106
+ "learning_rate": 0.00013306427798011615,
1107
+ "loss": 0.7436,
1108
+ "step": 14100
1109
+ },
1110
+ {
1111
+ "epoch": 1.0355690871634062,
1112
+ "grad_norm": 0.8501101136207581,
1113
+ "learning_rate": 0.00013257210355349937,
1114
+ "loss": 0.7472,
1115
+ "step": 14200
1116
+ },
1117
+ {
1118
+ "epoch": 1.0428615704362727,
1119
+ "grad_norm": 0.8830447793006897,
1120
+ "learning_rate": 0.0001320799291268826,
1121
+ "loss": 0.7438,
1122
+ "step": 14300
1123
+ },
1124
+ {
1125
+ "epoch": 1.0501540537091394,
1126
+ "grad_norm": 0.8827272057533264,
1127
+ "learning_rate": 0.00013158775470026578,
1128
+ "loss": 0.7439,
1129
+ "step": 14400
1130
+ },
1131
+ {
1132
+ "epoch": 1.0574465369820059,
1133
+ "grad_norm": 0.7875618934631348,
1134
+ "learning_rate": 0.000131095580273649,
1135
+ "loss": 0.7426,
1136
+ "step": 14500
1137
+ },
1138
+ {
1139
+ "epoch": 1.0647390202548723,
1140
+ "grad_norm": 0.9906949996948242,
1141
+ "learning_rate": 0.0001306034058470322,
1142
+ "loss": 0.7418,
1143
+ "step": 14600
1144
+ },
1145
+ {
1146
+ "epoch": 1.0720315035277388,
1147
+ "grad_norm": 0.8803852200508118,
1148
+ "learning_rate": 0.00013011123142041538,
1149
+ "loss": 0.7421,
1150
+ "step": 14700
1151
+ },
1152
+ {
1153
+ "epoch": 1.0793239868006053,
1154
+ "grad_norm": 0.8951194286346436,
1155
+ "learning_rate": 0.0001296190569937986,
1156
+ "loss": 0.7429,
1157
+ "step": 14800
1158
+ },
1159
+ {
1160
+ "epoch": 1.0866164700734717,
1161
+ "grad_norm": 0.8548495769500732,
1162
+ "learning_rate": 0.00012912688256718182,
1163
+ "loss": 0.7462,
1164
+ "step": 14900
1165
+ },
1166
+ {
1167
+ "epoch": 1.0939089533463382,
1168
+ "grad_norm": 0.9326722025871277,
1169
+ "learning_rate": 0.00012863470814056501,
1170
+ "loss": 0.7515,
1171
+ "step": 15000
1172
+ },
1173
+ {
1174
+ "epoch": 1.0939089533463382,
1175
+ "eval_loss": 0.7423983812332153,
1176
+ "eval_runtime": 61.1091,
1177
+ "eval_samples_per_second": 146.541,
1178
+ "eval_steps_per_second": 18.328,
1179
+ "step": 15000
1180
+ },
1181
+ {
1182
+ "epoch": 1.1012014366192047,
1183
+ "grad_norm": 0.8803513646125793,
1184
+ "learning_rate": 0.00012814253371394823,
1185
+ "loss": 0.7369,
1186
+ "step": 15100
1187
+ },
1188
+ {
1189
+ "epoch": 1.1084939198920711,
1190
+ "grad_norm": 0.8555076122283936,
1191
+ "learning_rate": 0.00012765035928733145,
1192
+ "loss": 0.7414,
1193
+ "step": 15200
1194
+ },
1195
+ {
1196
+ "epoch": 1.1157864031649378,
1197
+ "grad_norm": 0.8760358691215515,
1198
+ "learning_rate": 0.00012715818486071464,
1199
+ "loss": 0.741,
1200
+ "step": 15300
1201
+ },
1202
+ {
1203
+ "epoch": 1.1230788864378043,
1204
+ "grad_norm": 0.8444579839706421,
1205
+ "learning_rate": 0.00012666601043409784,
1206
+ "loss": 0.7448,
1207
+ "step": 15400
1208
+ },
1209
+ {
1210
+ "epoch": 1.1303713697106708,
1211
+ "grad_norm": 0.8995528221130371,
1212
+ "learning_rate": 0.00012617383600748106,
1213
+ "loss": 0.7436,
1214
+ "step": 15500
1215
+ },
1216
+ {
1217
+ "epoch": 1.1376638529835372,
1218
+ "grad_norm": 0.8966475129127502,
1219
+ "learning_rate": 0.00012568166158086427,
1220
+ "loss": 0.7485,
1221
+ "step": 15600
1222
+ },
1223
+ {
1224
+ "epoch": 1.1449563362564037,
1225
+ "grad_norm": 0.8527953028678894,
1226
+ "learning_rate": 0.00012518948715424747,
1227
+ "loss": 0.7303,
1228
+ "step": 15700
1229
+ },
1230
+ {
1231
+ "epoch": 1.1522488195292702,
1232
+ "grad_norm": 0.8657513856887817,
1233
+ "learning_rate": 0.00012469731272763069,
1234
+ "loss": 0.7431,
1235
+ "step": 15800
1236
+ },
1237
+ {
1238
+ "epoch": 1.1595413028021366,
1239
+ "grad_norm": 0.8745185136795044,
1240
+ "learning_rate": 0.00012420513830101388,
1241
+ "loss": 0.7426,
1242
+ "step": 15900
1243
+ },
1244
+ {
1245
+ "epoch": 1.166833786075003,
1246
+ "grad_norm": 0.8729378581047058,
1247
+ "learning_rate": 0.0001237129638743971,
1248
+ "loss": 0.7389,
1249
+ "step": 16000
1250
+ },
1251
+ {
1252
+ "epoch": 1.166833786075003,
1253
+ "eval_loss": 0.740699291229248,
1254
+ "eval_runtime": 60.635,
1255
+ "eval_samples_per_second": 147.687,
1256
+ "eval_steps_per_second": 18.471,
1257
+ "step": 16000
1258
+ },
1259
+ {
1260
+ "epoch": 1.1741262693478696,
1261
+ "grad_norm": 0.8877021670341492,
1262
+ "learning_rate": 0.00012322078944778032,
1263
+ "loss": 0.7419,
1264
+ "step": 16100
1265
+ },
1266
+ {
1267
+ "epoch": 1.1814187526207363,
1268
+ "grad_norm": 0.9095293283462524,
1269
+ "learning_rate": 0.0001227286150211635,
1270
+ "loss": 0.7365,
1271
+ "step": 16200
1272
+ },
1273
+ {
1274
+ "epoch": 1.1887112358936027,
1275
+ "grad_norm": 0.8597880601882935,
1276
+ "learning_rate": 0.0001222364405945467,
1277
+ "loss": 0.7336,
1278
+ "step": 16300
1279
+ },
1280
+ {
1281
+ "epoch": 1.1960037191664692,
1282
+ "grad_norm": 0.9574359059333801,
1283
+ "learning_rate": 0.0001217442661679299,
1284
+ "loss": 0.7394,
1285
+ "step": 16400
1286
+ },
1287
+ {
1288
+ "epoch": 1.2032962024393357,
1289
+ "grad_norm": 0.8484875559806824,
1290
+ "learning_rate": 0.00012125209174131314,
1291
+ "loss": 0.7392,
1292
+ "step": 16500
1293
+ },
1294
+ {
1295
+ "epoch": 1.2105886857122021,
1296
+ "grad_norm": 0.8847618699073792,
1297
+ "learning_rate": 0.00012075991731469633,
1298
+ "loss": 0.7427,
1299
+ "step": 16600
1300
+ },
1301
+ {
1302
+ "epoch": 1.2178811689850686,
1303
+ "grad_norm": 0.8780632019042969,
1304
+ "learning_rate": 0.00012026774288807954,
1305
+ "loss": 0.7399,
1306
+ "step": 16700
1307
+ },
1308
+ {
1309
+ "epoch": 1.225173652257935,
1310
+ "grad_norm": 0.8698965311050415,
1311
+ "learning_rate": 0.00011977556846146274,
1312
+ "loss": 0.7395,
1313
+ "step": 16800
1314
+ },
1315
+ {
1316
+ "epoch": 1.2324661355308015,
1317
+ "grad_norm": 0.8717935085296631,
1318
+ "learning_rate": 0.00011928339403484596,
1319
+ "loss": 0.7404,
1320
+ "step": 16900
1321
+ },
1322
+ {
1323
+ "epoch": 1.239758618803668,
1324
+ "grad_norm": 0.8375683426856995,
1325
+ "learning_rate": 0.00011879121960822917,
1326
+ "loss": 0.7405,
1327
+ "step": 17000
1328
+ },
1329
+ {
1330
+ "epoch": 1.239758618803668,
1331
+ "eval_loss": 0.7371787428855896,
1332
+ "eval_runtime": 60.9373,
1333
+ "eval_samples_per_second": 146.954,
1334
+ "eval_steps_per_second": 18.38,
1335
+ "step": 17000
1336
+ },
1337
+ {
1338
+ "epoch": 1.2470511020765347,
1339
+ "grad_norm": 0.8756095170974731,
1340
+ "learning_rate": 0.00011829904518161237,
1341
+ "loss": 0.736,
1342
+ "step": 17100
1343
+ },
1344
+ {
1345
+ "epoch": 1.2543435853494012,
1346
+ "grad_norm": 0.8513076901435852,
1347
+ "learning_rate": 0.00011780687075499556,
1348
+ "loss": 0.7399,
1349
+ "step": 17200
1350
+ },
1351
+ {
1352
+ "epoch": 1.2616360686222676,
1353
+ "grad_norm": 0.8297843337059021,
1354
+ "learning_rate": 0.0001173146963283788,
1355
+ "loss": 0.7406,
1356
+ "step": 17300
1357
+ },
1358
+ {
1359
+ "epoch": 1.268928551895134,
1360
+ "grad_norm": 0.8896269202232361,
1361
+ "learning_rate": 0.00011682252190176199,
1362
+ "loss": 0.7346,
1363
+ "step": 17400
1364
+ },
1365
+ {
1366
+ "epoch": 1.2762210351680006,
1367
+ "grad_norm": 0.874168336391449,
1368
+ "learning_rate": 0.0001163303474751452,
1369
+ "loss": 0.736,
1370
+ "step": 17500
1371
+ },
1372
+ {
1373
+ "epoch": 1.283513518440867,
1374
+ "grad_norm": 0.9101394414901733,
1375
+ "learning_rate": 0.0001158381730485284,
1376
+ "loss": 0.7376,
1377
+ "step": 17600
1378
+ },
1379
+ {
1380
+ "epoch": 1.2908060017137335,
1381
+ "grad_norm": 0.9011333584785461,
1382
+ "learning_rate": 0.00011534599862191162,
1383
+ "loss": 0.7361,
1384
+ "step": 17700
1385
+ },
1386
+ {
1387
+ "epoch": 1.2980984849866002,
1388
+ "grad_norm": 0.8839349746704102,
1389
+ "learning_rate": 0.00011485382419529482,
1390
+ "loss": 0.7373,
1391
+ "step": 17800
1392
+ },
1393
+ {
1394
+ "epoch": 1.3053909682594664,
1395
+ "grad_norm": 0.830528974533081,
1396
+ "learning_rate": 0.00011436164976867803,
1397
+ "loss": 0.7336,
1398
+ "step": 17900
1399
+ },
1400
+ {
1401
+ "epoch": 1.3126834515323331,
1402
+ "grad_norm": 0.8777081370353699,
1403
+ "learning_rate": 0.00011386947534206122,
1404
+ "loss": 0.7379,
1405
+ "step": 18000
1406
+ },
1407
+ {
1408
+ "epoch": 1.3126834515323331,
1409
+ "eval_loss": 0.7359282970428467,
1410
+ "eval_runtime": 60.8023,
1411
+ "eval_samples_per_second": 147.281,
1412
+ "eval_steps_per_second": 18.42,
1413
+ "step": 18000
1414
+ },
1415
+ {
1416
+ "epoch": 1.3199759348051996,
1417
+ "grad_norm": 0.8853510022163391,
1418
+ "learning_rate": 0.00011337730091544443,
1419
+ "loss": 0.7376,
1420
+ "step": 18100
1421
+ },
1422
+ {
1423
+ "epoch": 1.327268418078066,
1424
+ "grad_norm": 0.9219810366630554,
1425
+ "learning_rate": 0.00011288512648882766,
1426
+ "loss": 0.7399,
1427
+ "step": 18200
1428
+ },
1429
+ {
1430
+ "epoch": 1.3345609013509325,
1431
+ "grad_norm": 0.9233282208442688,
1432
+ "learning_rate": 0.00011239295206221085,
1433
+ "loss": 0.7399,
1434
+ "step": 18300
1435
+ },
1436
+ {
1437
+ "epoch": 1.341853384623799,
1438
+ "grad_norm": 0.8359719514846802,
1439
+ "learning_rate": 0.00011190077763559406,
1440
+ "loss": 0.7366,
1441
+ "step": 18400
1442
+ },
1443
+ {
1444
+ "epoch": 1.3491458678966655,
1445
+ "grad_norm": 0.8673479557037354,
1446
+ "learning_rate": 0.00011140860320897726,
1447
+ "loss": 0.7398,
1448
+ "step": 18500
1449
+ },
1450
+ {
1451
+ "epoch": 1.356438351169532,
1452
+ "grad_norm": 0.8565610647201538,
1453
+ "learning_rate": 0.00011091642878236048,
1454
+ "loss": 0.7278,
1455
+ "step": 18600
1456
+ },
1457
+ {
1458
+ "epoch": 1.3637308344423986,
1459
+ "grad_norm": 0.8547226190567017,
1460
+ "learning_rate": 0.00011042425435574369,
1461
+ "loss": 0.7381,
1462
+ "step": 18700
1463
+ },
1464
+ {
1465
+ "epoch": 1.3710233177152649,
1466
+ "grad_norm": 0.897081732749939,
1467
+ "learning_rate": 0.00010993207992912688,
1468
+ "loss": 0.7339,
1469
+ "step": 18800
1470
+ },
1471
+ {
1472
+ "epoch": 1.3783158009881316,
1473
+ "grad_norm": 0.8852410912513733,
1474
+ "learning_rate": 0.00010943990550251008,
1475
+ "loss": 0.7342,
1476
+ "step": 18900
1477
+ },
1478
+ {
1479
+ "epoch": 1.385608284260998,
1480
+ "grad_norm": 0.9213690161705017,
1481
+ "learning_rate": 0.00010894773107589332,
1482
+ "loss": 0.7389,
1483
+ "step": 19000
1484
+ },
1485
+ {
1486
+ "epoch": 1.385608284260998,
1487
+ "eval_loss": 0.7335625886917114,
1488
+ "eval_runtime": 60.8231,
1489
+ "eval_samples_per_second": 147.23,
1490
+ "eval_steps_per_second": 18.414,
1491
+ "step": 19000
1492
+ },
1493
+ {
1494
+ "epoch": 1.3929007675338645,
1495
+ "grad_norm": 0.8398423790931702,
1496
+ "learning_rate": 0.00010845555664927651,
1497
+ "loss": 0.7274,
1498
+ "step": 19100
1499
+ },
1500
+ {
1501
+ "epoch": 1.400193250806731,
1502
+ "grad_norm": 0.8863806128501892,
1503
+ "learning_rate": 0.00010796338222265971,
1504
+ "loss": 0.7331,
1505
+ "step": 19200
1506
+ },
1507
+ {
1508
+ "epoch": 1.4074857340795974,
1509
+ "grad_norm": 0.8836521506309509,
1510
+ "learning_rate": 0.00010747120779604292,
1511
+ "loss": 0.7334,
1512
+ "step": 19300
1513
+ },
1514
+ {
1515
+ "epoch": 1.414778217352464,
1516
+ "grad_norm": 0.8278964757919312,
1517
+ "learning_rate": 0.00010697903336942614,
1518
+ "loss": 0.7281,
1519
+ "step": 19400
1520
+ },
1521
+ {
1522
+ "epoch": 1.4220707006253304,
1523
+ "grad_norm": 0.8681420087814331,
1524
+ "learning_rate": 0.00010648685894280934,
1525
+ "loss": 0.7345,
1526
+ "step": 19500
1527
+ },
1528
+ {
1529
+ "epoch": 1.429363183898197,
1530
+ "grad_norm": 0.8721694946289062,
1531
+ "learning_rate": 0.00010599468451619255,
1532
+ "loss": 0.7246,
1533
+ "step": 19600
1534
+ },
1535
+ {
1536
+ "epoch": 1.4366556671710633,
1537
+ "grad_norm": 0.8880037665367126,
1538
+ "learning_rate": 0.00010550251008957574,
1539
+ "loss": 0.7321,
1540
+ "step": 19700
1541
+ },
1542
+ {
1543
+ "epoch": 1.44394815044393,
1544
+ "grad_norm": 0.8522552251815796,
1545
+ "learning_rate": 0.00010501033566295895,
1546
+ "loss": 0.734,
1547
+ "step": 19800
1548
+ },
1549
+ {
1550
+ "epoch": 1.4512406337167965,
1551
+ "grad_norm": 0.8816943168640137,
1552
+ "learning_rate": 0.00010451816123634217,
1553
+ "loss": 0.7333,
1554
+ "step": 19900
1555
+ },
1556
+ {
1557
+ "epoch": 1.458533116989663,
1558
+ "grad_norm": 0.8068501949310303,
1559
+ "learning_rate": 0.00010402598680972537,
1560
+ "loss": 0.7267,
1561
+ "step": 20000
1562
+ },
1563
+ {
1564
+ "epoch": 1.458533116989663,
1565
+ "eval_loss": 0.731645405292511,
1566
+ "eval_runtime": 61.0998,
1567
+ "eval_samples_per_second": 146.563,
1568
+ "eval_steps_per_second": 18.331,
1569
+ "step": 20000
1570
+ },
1571
+ {
1572
+ "epoch": 1.4658256002625294,
1573
+ "grad_norm": 0.8473337888717651,
1574
+ "learning_rate": 0.00010353381238310858,
1575
+ "loss": 0.7328,
1576
+ "step": 20100
1577
+ },
1578
+ {
1579
+ "epoch": 1.4731180835353959,
1580
+ "grad_norm": 0.9009122252464294,
1581
+ "learning_rate": 0.00010304163795649177,
1582
+ "loss": 0.733,
1583
+ "step": 20200
1584
+ },
1585
+ {
1586
+ "epoch": 1.4804105668082623,
1587
+ "grad_norm": 0.8225035667419434,
1588
+ "learning_rate": 0.000102549463529875,
1589
+ "loss": 0.7311,
1590
+ "step": 20300
1591
+ },
1592
+ {
1593
+ "epoch": 1.4877030500811288,
1594
+ "grad_norm": 0.8552617430686951,
1595
+ "learning_rate": 0.00010205728910325821,
1596
+ "loss": 0.7282,
1597
+ "step": 20400
1598
+ },
1599
+ {
1600
+ "epoch": 1.4949955333539955,
1601
+ "grad_norm": 0.8690235614776611,
1602
+ "learning_rate": 0.0001015651146766414,
1603
+ "loss": 0.7329,
1604
+ "step": 20500
1605
+ },
1606
+ {
1607
+ "epoch": 1.5022880166268617,
1608
+ "grad_norm": 0.8566781878471375,
1609
+ "learning_rate": 0.0001010729402500246,
1610
+ "loss": 0.7358,
1611
+ "step": 20600
1612
+ },
1613
+ {
1614
+ "epoch": 1.5095804998997284,
1615
+ "grad_norm": 0.9174933433532715,
1616
+ "learning_rate": 0.00010058076582340782,
1617
+ "loss": 0.7266,
1618
+ "step": 20700
1619
+ },
1620
+ {
1621
+ "epoch": 1.516872983172595,
1622
+ "grad_norm": 0.9414506554603577,
1623
+ "learning_rate": 0.00010008859139679103,
1624
+ "loss": 0.7321,
1625
+ "step": 20800
1626
+ },
1627
+ {
1628
+ "epoch": 1.5241654664454614,
1629
+ "grad_norm": 0.9433586001396179,
1630
+ "learning_rate": 9.959641697017424e-05,
1631
+ "loss": 0.7355,
1632
+ "step": 20900
1633
+ },
1634
+ {
1635
+ "epoch": 1.5314579497183278,
1636
+ "grad_norm": 0.8544315695762634,
1637
+ "learning_rate": 9.910424254355744e-05,
1638
+ "loss": 0.7313,
1639
+ "step": 21000
1640
+ },
1641
+ {
1642
+ "epoch": 1.5314579497183278,
1643
+ "eval_loss": 0.7285299301147461,
1644
+ "eval_runtime": 60.6886,
1645
+ "eval_samples_per_second": 147.557,
1646
+ "eval_steps_per_second": 18.455,
1647
+ "step": 21000
1648
+ },
1649
+ {
1650
+ "epoch": 1.5387504329911943,
1651
+ "grad_norm": 0.893223762512207,
1652
+ "learning_rate": 9.861206811694065e-05,
1653
+ "loss": 0.7329,
1654
+ "step": 21100
1655
+ },
1656
+ {
1657
+ "epoch": 1.546042916264061,
1658
+ "grad_norm": 0.8868634104728699,
1659
+ "learning_rate": 9.811989369032387e-05,
1660
+ "loss": 0.7276,
1661
+ "step": 21200
1662
+ },
1663
+ {
1664
+ "epoch": 1.5533353995369272,
1665
+ "grad_norm": 0.8362566232681274,
1666
+ "learning_rate": 9.762771926370706e-05,
1667
+ "loss": 0.723,
1668
+ "step": 21300
1669
+ },
1670
+ {
1671
+ "epoch": 1.560627882809794,
1672
+ "grad_norm": 0.8852083086967468,
1673
+ "learning_rate": 9.713554483709026e-05,
1674
+ "loss": 0.7281,
1675
+ "step": 21400
1676
+ },
1677
+ {
1678
+ "epoch": 1.5679203660826602,
1679
+ "grad_norm": 0.8901813626289368,
1680
+ "learning_rate": 9.664337041047348e-05,
1681
+ "loss": 0.7307,
1682
+ "step": 21500
1683
+ },
1684
+ {
1685
+ "epoch": 1.5752128493555269,
1686
+ "grad_norm": 0.8210172057151794,
1687
+ "learning_rate": 9.615119598385667e-05,
1688
+ "loss": 0.7245,
1689
+ "step": 21600
1690
+ },
1691
+ {
1692
+ "epoch": 1.5825053326283933,
1693
+ "grad_norm": 0.8676414489746094,
1694
+ "learning_rate": 9.56590215572399e-05,
1695
+ "loss": 0.7294,
1696
+ "step": 21700
1697
+ },
1698
+ {
1699
+ "epoch": 1.5897978159012598,
1700
+ "grad_norm": 0.8923740983009338,
1701
+ "learning_rate": 9.51668471306231e-05,
1702
+ "loss": 0.7242,
1703
+ "step": 21800
1704
+ },
1705
+ {
1706
+ "epoch": 1.5970902991741263,
1707
+ "grad_norm": 0.8402920365333557,
1708
+ "learning_rate": 9.46746727040063e-05,
1709
+ "loss": 0.7258,
1710
+ "step": 21900
1711
+ },
1712
+ {
1713
+ "epoch": 1.6043827824469927,
1714
+ "grad_norm": 0.8525983691215515,
1715
+ "learning_rate": 9.418249827738951e-05,
1716
+ "loss": 0.7294,
1717
+ "step": 22000
1718
+ },
1719
+ {
1720
+ "epoch": 1.6043827824469927,
1721
+ "eval_loss": 0.7267495393753052,
1722
+ "eval_runtime": 61.1086,
1723
+ "eval_samples_per_second": 146.542,
1724
+ "eval_steps_per_second": 18.328,
1725
+ "step": 22000
1726
+ },
1727
+ {
1728
+ "epoch": 1.6116752657198594,
1729
+ "grad_norm": 0.8605002164840698,
1730
+ "learning_rate": 9.369032385077272e-05,
1731
+ "loss": 0.7259,
1732
+ "step": 22100
1733
+ },
1734
+ {
1735
+ "epoch": 1.6189677489927257,
1736
+ "grad_norm": 0.8606895208358765,
1737
+ "learning_rate": 9.319814942415592e-05,
1738
+ "loss": 0.7275,
1739
+ "step": 22200
1740
+ },
1741
+ {
1742
+ "epoch": 1.6262602322655924,
1743
+ "grad_norm": 0.8824227452278137,
1744
+ "learning_rate": 9.270597499753914e-05,
1745
+ "loss": 0.7245,
1746
+ "step": 22300
1747
+ },
1748
+ {
1749
+ "epoch": 1.6335527155384586,
1750
+ "grad_norm": 0.8670118451118469,
1751
+ "learning_rate": 9.221380057092233e-05,
1752
+ "loss": 0.719,
1753
+ "step": 22400
1754
+ },
1755
+ {
1756
+ "epoch": 1.6408451988113253,
1757
+ "grad_norm": 0.92063307762146,
1758
+ "learning_rate": 9.172162614430555e-05,
1759
+ "loss": 0.7293,
1760
+ "step": 22500
1761
+ },
1762
+ {
1763
+ "epoch": 1.6481376820841918,
1764
+ "grad_norm": 0.8425260782241821,
1765
+ "learning_rate": 9.122945171768876e-05,
1766
+ "loss": 0.728,
1767
+ "step": 22600
1768
+ },
1769
+ {
1770
+ "epoch": 1.6554301653570582,
1771
+ "grad_norm": 0.9162302017211914,
1772
+ "learning_rate": 9.073727729107196e-05,
1773
+ "loss": 0.7265,
1774
+ "step": 22700
1775
+ },
1776
+ {
1777
+ "epoch": 1.6627226486299247,
1778
+ "grad_norm": 0.8905067443847656,
1779
+ "learning_rate": 9.024510286445517e-05,
1780
+ "loss": 0.7256,
1781
+ "step": 22800
1782
+ },
1783
+ {
1784
+ "epoch": 1.6700151319027912,
1785
+ "grad_norm": 0.874357283115387,
1786
+ "learning_rate": 8.975292843783837e-05,
1787
+ "loss": 0.7249,
1788
+ "step": 22900
1789
+ },
1790
+ {
1791
+ "epoch": 1.6773076151756579,
1792
+ "grad_norm": 0.842005729675293,
1793
+ "learning_rate": 8.926075401122158e-05,
1794
+ "loss": 0.7268,
1795
+ "step": 23000
1796
+ },
1797
+ {
1798
+ "epoch": 1.6773076151756579,
1799
+ "eval_loss": 0.7241798639297485,
1800
+ "eval_runtime": 60.7958,
1801
+ "eval_samples_per_second": 147.296,
1802
+ "eval_steps_per_second": 18.422,
1803
+ "step": 23000
1804
+ },
1805
+ {
1806
+ "epoch": 1.684600098448524,
1807
+ "grad_norm": 0.8695193529129028,
1808
+ "learning_rate": 8.876857958460478e-05,
1809
+ "loss": 0.7262,
1810
+ "step": 23100
1811
+ },
1812
+ {
1813
+ "epoch": 1.6918925817213908,
1814
+ "grad_norm": 0.8673058748245239,
1815
+ "learning_rate": 8.827640515798799e-05,
1816
+ "loss": 0.7303,
1817
+ "step": 23200
1818
+ },
1819
+ {
1820
+ "epoch": 1.699185064994257,
1821
+ "grad_norm": 0.9276596307754517,
1822
+ "learning_rate": 8.77842307313712e-05,
1823
+ "loss": 0.729,
1824
+ "step": 23300
1825
+ },
1826
+ {
1827
+ "epoch": 1.7064775482671237,
1828
+ "grad_norm": 0.8023722171783447,
1829
+ "learning_rate": 8.729205630475441e-05,
1830
+ "loss": 0.7212,
1831
+ "step": 23400
1832
+ },
1833
+ {
1834
+ "epoch": 1.7137700315399902,
1835
+ "grad_norm": 0.910897433757782,
1836
+ "learning_rate": 8.67998818781376e-05,
1837
+ "loss": 0.7252,
1838
+ "step": 23500
1839
+ },
1840
+ {
1841
+ "epoch": 1.7210625148128567,
1842
+ "grad_norm": 0.8714926838874817,
1843
+ "learning_rate": 8.630770745152083e-05,
1844
+ "loss": 0.7306,
1845
+ "step": 23600
1846
+ },
1847
+ {
1848
+ "epoch": 1.7283549980857231,
1849
+ "grad_norm": 0.8875166773796082,
1850
+ "learning_rate": 8.581553302490403e-05,
1851
+ "loss": 0.7235,
1852
+ "step": 23700
1853
+ },
1854
+ {
1855
+ "epoch": 1.7356474813585896,
1856
+ "grad_norm": 0.9132345914840698,
1857
+ "learning_rate": 8.532335859828724e-05,
1858
+ "loss": 0.7331,
1859
+ "step": 23800
1860
+ },
1861
+ {
1862
+ "epoch": 1.7429399646314563,
1863
+ "grad_norm": 0.8562710285186768,
1864
+ "learning_rate": 8.483118417167044e-05,
1865
+ "loss": 0.7282,
1866
+ "step": 23900
1867
+ },
1868
+ {
1869
+ "epoch": 1.7502324479043225,
1870
+ "grad_norm": 0.867508590221405,
1871
+ "learning_rate": 8.433900974505365e-05,
1872
+ "loss": 0.7256,
1873
+ "step": 24000
1874
+ },
1875
+ {
1876
+ "epoch": 1.7502324479043225,
1877
+ "eval_loss": 0.7232645153999329,
1878
+ "eval_runtime": 60.377,
1879
+ "eval_samples_per_second": 148.318,
1880
+ "eval_steps_per_second": 18.55,
1881
+ "step": 24000
1882
+ },
1883
+ {
1884
+ "epoch": 1.7575249311771892,
1885
+ "grad_norm": 0.8258200287818909,
1886
+ "learning_rate": 8.384683531843685e-05,
1887
+ "loss": 0.7254,
1888
+ "step": 24100
1889
+ },
1890
+ {
1891
+ "epoch": 1.7648174144500555,
1892
+ "grad_norm": 0.9109018445014954,
1893
+ "learning_rate": 8.335466089182007e-05,
1894
+ "loss": 0.7315,
1895
+ "step": 24200
1896
+ },
1897
+ {
1898
+ "epoch": 1.7721098977229222,
1899
+ "grad_norm": 0.8500842452049255,
1900
+ "learning_rate": 8.286248646520326e-05,
1901
+ "loss": 0.7265,
1902
+ "step": 24300
1903
+ },
1904
+ {
1905
+ "epoch": 1.7794023809957886,
1906
+ "grad_norm": 0.9286713600158691,
1907
+ "learning_rate": 8.237031203858648e-05,
1908
+ "loss": 0.7247,
1909
+ "step": 24400
1910
+ },
1911
+ {
1912
+ "epoch": 1.786694864268655,
1913
+ "grad_norm": 0.8746926188468933,
1914
+ "learning_rate": 8.187813761196969e-05,
1915
+ "loss": 0.7261,
1916
+ "step": 24500
1917
+ },
1918
+ {
1919
+ "epoch": 1.7939873475415216,
1920
+ "grad_norm": 0.8702288866043091,
1921
+ "learning_rate": 8.13859631853529e-05,
1922
+ "loss": 0.7207,
1923
+ "step": 24600
1924
+ },
1925
+ {
1926
+ "epoch": 1.801279830814388,
1927
+ "grad_norm": 0.9746344089508057,
1928
+ "learning_rate": 8.08937887587361e-05,
1929
+ "loss": 0.728,
1930
+ "step": 24700
1931
+ },
1932
+ {
1933
+ "epoch": 1.8085723140872547,
1934
+ "grad_norm": 0.8815904259681702,
1935
+ "learning_rate": 8.04016143321193e-05,
1936
+ "loss": 0.7174,
1937
+ "step": 24800
1938
+ },
1939
+ {
1940
+ "epoch": 1.815864797360121,
1941
+ "grad_norm": 0.870474100112915,
1942
+ "learning_rate": 7.990943990550251e-05,
1943
+ "loss": 0.7316,
1944
+ "step": 24900
1945
+ },
1946
+ {
1947
+ "epoch": 1.8231572806329877,
1948
+ "grad_norm": 0.8451401591300964,
1949
+ "learning_rate": 7.941726547888572e-05,
1950
+ "loss": 0.7202,
1951
+ "step": 25000
1952
+ },
1953
+ {
1954
+ "epoch": 1.8231572806329877,
1955
+ "eval_loss": 0.721147358417511,
1956
+ "eval_runtime": 60.8906,
1957
+ "eval_samples_per_second": 147.067,
1958
+ "eval_steps_per_second": 18.394,
1959
+ "step": 25000
1960
+ },
1961
+ {
1962
+ "epoch": 1.830449763905854,
1963
+ "grad_norm": 0.8878180980682373,
1964
+ "learning_rate": 7.892509105226894e-05,
1965
+ "loss": 0.7236,
1966
+ "step": 25100
1967
+ },
1968
+ {
1969
+ "epoch": 1.8377422471787206,
1970
+ "grad_norm": 0.859920859336853,
1971
+ "learning_rate": 7.843291662565213e-05,
1972
+ "loss": 0.7257,
1973
+ "step": 25200
1974
+ },
1975
+ {
1976
+ "epoch": 1.845034730451587,
1977
+ "grad_norm": 0.9358228445053101,
1978
+ "learning_rate": 7.794074219903535e-05,
1979
+ "loss": 0.7175,
1980
+ "step": 25300
1981
+ },
1982
+ {
1983
+ "epoch": 1.8523272137244535,
1984
+ "grad_norm": 0.858906626701355,
1985
+ "learning_rate": 7.744856777241854e-05,
1986
+ "loss": 0.7217,
1987
+ "step": 25400
1988
+ },
1989
+ {
1990
+ "epoch": 1.85961969699732,
1991
+ "grad_norm": 0.9508287310600281,
1992
+ "learning_rate": 7.695639334580176e-05,
1993
+ "loss": 0.7211,
1994
+ "step": 25500
1995
+ },
1996
+ {
1997
+ "epoch": 1.8669121802701865,
1998
+ "grad_norm": 0.9340062141418457,
1999
+ "learning_rate": 7.646421891918496e-05,
2000
+ "loss": 0.7254,
2001
+ "step": 25600
2002
+ },
2003
+ {
2004
+ "epoch": 1.8742046635430532,
2005
+ "grad_norm": 0.9350687861442566,
2006
+ "learning_rate": 7.597204449256817e-05,
2007
+ "loss": 0.7247,
2008
+ "step": 25700
2009
+ },
2010
+ {
2011
+ "epoch": 1.8814971468159194,
2012
+ "grad_norm": 0.9614841938018799,
2013
+ "learning_rate": 7.547987006595137e-05,
2014
+ "loss": 0.7283,
2015
+ "step": 25800
2016
+ },
2017
+ {
2018
+ "epoch": 1.888789630088786,
2019
+ "grad_norm": 0.848640501499176,
2020
+ "learning_rate": 7.49876956393346e-05,
2021
+ "loss": 0.7221,
2022
+ "step": 25900
2023
+ },
2024
+ {
2025
+ "epoch": 1.8960821133616523,
2026
+ "grad_norm": 0.8105534315109253,
2027
+ "learning_rate": 7.449552121271779e-05,
2028
+ "loss": 0.7205,
2029
+ "step": 26000
2030
+ },
2031
+ {
2032
+ "epoch": 1.8960821133616523,
2033
+ "eval_loss": 0.7193262577056885,
2034
+ "eval_runtime": 61.1614,
2035
+ "eval_samples_per_second": 146.416,
2036
+ "eval_steps_per_second": 18.312,
2037
+ "step": 26000
2038
+ },
2039
+ {
2040
+ "epoch": 1.903374596634519,
2041
+ "grad_norm": 0.8522207736968994,
2042
+ "learning_rate": 7.4003346786101e-05,
2043
+ "loss": 0.7223,
2044
+ "step": 26100
2045
+ },
2046
+ {
2047
+ "epoch": 1.9106670799073855,
2048
+ "grad_norm": 0.8983740210533142,
2049
+ "learning_rate": 7.351117235948421e-05,
2050
+ "loss": 0.7208,
2051
+ "step": 26200
2052
+ },
2053
+ {
2054
+ "epoch": 1.917959563180252,
2055
+ "grad_norm": 0.8596473336219788,
2056
+ "learning_rate": 7.301899793286742e-05,
2057
+ "loss": 0.7184,
2058
+ "step": 26300
2059
+ },
2060
+ {
2061
+ "epoch": 1.9252520464531184,
2062
+ "grad_norm": 0.9175098538398743,
2063
+ "learning_rate": 7.252682350625062e-05,
2064
+ "loss": 0.7213,
2065
+ "step": 26400
2066
+ },
2067
+ {
2068
+ "epoch": 1.932544529725985,
2069
+ "grad_norm": 0.8626872897148132,
2070
+ "learning_rate": 7.203464907963383e-05,
2071
+ "loss": 0.7242,
2072
+ "step": 26500
2073
+ },
2074
+ {
2075
+ "epoch": 1.9398370129988516,
2076
+ "grad_norm": 0.859780490398407,
2077
+ "learning_rate": 7.154247465301703e-05,
2078
+ "loss": 0.7197,
2079
+ "step": 26600
2080
+ },
2081
+ {
2082
+ "epoch": 1.9471294962717178,
2083
+ "grad_norm": 0.8713703751564026,
2084
+ "learning_rate": 7.105030022640024e-05,
2085
+ "loss": 0.7231,
2086
+ "step": 26700
2087
+ },
2088
+ {
2089
+ "epoch": 1.9544219795445845,
2090
+ "grad_norm": 0.8976535797119141,
2091
+ "learning_rate": 7.055812579978344e-05,
2092
+ "loss": 0.7233,
2093
+ "step": 26800
2094
+ },
2095
+ {
2096
+ "epoch": 1.9617144628174508,
2097
+ "grad_norm": 0.9257802367210388,
2098
+ "learning_rate": 7.006595137316665e-05,
2099
+ "loss": 0.7221,
2100
+ "step": 26900
2101
+ },
2102
+ {
2103
+ "epoch": 1.9690069460903175,
2104
+ "grad_norm": 0.8592785596847534,
2105
+ "learning_rate": 6.957377694654987e-05,
2106
+ "loss": 0.7168,
2107
+ "step": 27000
2108
+ },
2109
+ {
2110
+ "epoch": 1.9690069460903175,
2111
+ "eval_loss": 0.7180259227752686,
2112
+ "eval_runtime": 60.5352,
2113
+ "eval_samples_per_second": 147.931,
2114
+ "eval_steps_per_second": 18.502,
2115
+ "step": 27000
2116
+ },
2117
+ {
2118
+ "epoch": 1.976299429363184,
2119
+ "grad_norm": 0.8931472897529602,
2120
+ "learning_rate": 6.908160251993306e-05,
2121
+ "loss": 0.7204,
2122
+ "step": 27100
2123
+ },
2124
+ {
2125
+ "epoch": 1.9835919126360504,
2126
+ "grad_norm": 0.8821597695350647,
2127
+ "learning_rate": 6.858942809331628e-05,
2128
+ "loss": 0.7163,
2129
+ "step": 27200
2130
+ },
2131
+ {
2132
+ "epoch": 1.9908843959089169,
2133
+ "grad_norm": 0.8749621510505676,
2134
+ "learning_rate": 6.809725366669948e-05,
2135
+ "loss": 0.711,
2136
+ "step": 27300
2137
+ },
2138
+ {
2139
+ "epoch": 1.9981768791817833,
2140
+ "grad_norm": 0.903332531452179,
2141
+ "learning_rate": 6.760507924008269e-05,
2142
+ "loss": 0.7176,
2143
+ "step": 27400
2144
+ },
2145
+ {
2146
+ "epoch": 2.005505824871014,
2147
+ "grad_norm": 0.854773759841919,
2148
+ "learning_rate": 6.71129048134659e-05,
2149
+ "loss": 0.7187,
2150
+ "step": 27500
2151
+ },
2152
+ {
2153
+ "epoch": 2.0127983081438807,
2154
+ "grad_norm": 0.9489893913269043,
2155
+ "learning_rate": 6.66207303868491e-05,
2156
+ "loss": 0.7096,
2157
+ "step": 27600
2158
+ },
2159
+ {
2160
+ "epoch": 2.020090791416747,
2161
+ "grad_norm": 0.8944621682167053,
2162
+ "learning_rate": 6.61285559602323e-05,
2163
+ "loss": 0.7104,
2164
+ "step": 27700
2165
+ },
2166
+ {
2167
+ "epoch": 2.0273832746896137,
2168
+ "grad_norm": 0.8567011952400208,
2169
+ "learning_rate": 6.563638153361553e-05,
2170
+ "loss": 0.7124,
2171
+ "step": 27800
2172
+ },
2173
+ {
2174
+ "epoch": 2.0346757579624803,
2175
+ "grad_norm": 0.8737155199050903,
2176
+ "learning_rate": 6.514420710699872e-05,
2177
+ "loss": 0.7127,
2178
+ "step": 27900
2179
+ },
2180
+ {
2181
+ "epoch": 2.0419682412353466,
2182
+ "grad_norm": 0.8935887813568115,
2183
+ "learning_rate": 6.465203268038194e-05,
2184
+ "loss": 0.7122,
2185
+ "step": 28000
2186
+ },
2187
+ {
2188
+ "epoch": 2.0419682412353466,
2189
+ "eval_loss": 0.716705858707428,
2190
+ "eval_runtime": 60.7739,
2191
+ "eval_samples_per_second": 147.349,
2192
+ "eval_steps_per_second": 18.429,
2193
+ "step": 28000
2194
+ },
2195
+ {
2196
+ "epoch": 2.0492607245082133,
2197
+ "grad_norm": 0.9452987313270569,
2198
+ "learning_rate": 6.415985825376514e-05,
2199
+ "loss": 0.7112,
2200
+ "step": 28100
2201
+ },
2202
+ {
2203
+ "epoch": 2.0565532077810795,
2204
+ "grad_norm": 0.8650675415992737,
2205
+ "learning_rate": 6.366768382714833e-05,
2206
+ "loss": 0.7079,
2207
+ "step": 28200
2208
+ },
2209
+ {
2210
+ "epoch": 2.063845691053946,
2211
+ "grad_norm": 0.8913034796714783,
2212
+ "learning_rate": 6.317550940053155e-05,
2213
+ "loss": 0.713,
2214
+ "step": 28300
2215
+ },
2216
+ {
2217
+ "epoch": 2.0711381743268125,
2218
+ "grad_norm": 0.9072710275650024,
2219
+ "learning_rate": 6.268333497391476e-05,
2220
+ "loss": 0.7094,
2221
+ "step": 28400
2222
+ },
2223
+ {
2224
+ "epoch": 2.078430657599679,
2225
+ "grad_norm": 0.854245126247406,
2226
+ "learning_rate": 6.219116054729796e-05,
2227
+ "loss": 0.7077,
2228
+ "step": 28500
2229
+ },
2230
+ {
2231
+ "epoch": 2.0857231408725454,
2232
+ "grad_norm": 0.929263174533844,
2233
+ "learning_rate": 6.169898612068117e-05,
2234
+ "loss": 0.7086,
2235
+ "step": 28600
2236
+ },
2237
+ {
2238
+ "epoch": 2.093015624145412,
2239
+ "grad_norm": 0.9356215596199036,
2240
+ "learning_rate": 6.120681169406438e-05,
2241
+ "loss": 0.7157,
2242
+ "step": 28700
2243
+ },
2244
+ {
2245
+ "epoch": 2.100308107418279,
2246
+ "grad_norm": 0.9242870211601257,
2247
+ "learning_rate": 6.071463726744758e-05,
2248
+ "loss": 0.71,
2249
+ "step": 28800
2250
+ },
2251
+ {
2252
+ "epoch": 2.107600590691145,
2253
+ "grad_norm": 0.9065095782279968,
2254
+ "learning_rate": 6.022246284083079e-05,
2255
+ "loss": 0.7095,
2256
+ "step": 28900
2257
+ },
2258
+ {
2259
+ "epoch": 2.1148930739640117,
2260
+ "grad_norm": 0.9081276059150696,
2261
+ "learning_rate": 5.9730288414214e-05,
2262
+ "loss": 0.7096,
2263
+ "step": 29000
2264
+ },
2265
+ {
2266
+ "epoch": 2.1148930739640117,
2267
+ "eval_loss": 0.7152244448661804,
2268
+ "eval_runtime": 60.7986,
2269
+ "eval_samples_per_second": 147.29,
2270
+ "eval_steps_per_second": 18.421,
2271
+ "step": 29000
2272
+ },
2273
+ {
2274
+ "epoch": 2.122185557236878,
2275
+ "grad_norm": 0.8326215744018555,
2276
+ "learning_rate": 5.923811398759721e-05,
2277
+ "loss": 0.7147,
2278
+ "step": 29100
2279
+ },
2280
+ {
2281
+ "epoch": 2.1294780405097447,
2282
+ "grad_norm": 0.9274723529815674,
2283
+ "learning_rate": 5.874593956098041e-05,
2284
+ "loss": 0.7111,
2285
+ "step": 29200
2286
+ },
2287
+ {
2288
+ "epoch": 2.136770523782611,
2289
+ "grad_norm": 0.8282331824302673,
2290
+ "learning_rate": 5.825376513436362e-05,
2291
+ "loss": 0.7137,
2292
+ "step": 29300
2293
+ },
2294
+ {
2295
+ "epoch": 2.1440630070554776,
2296
+ "grad_norm": 0.9081612229347229,
2297
+ "learning_rate": 5.776159070774683e-05,
2298
+ "loss": 0.7115,
2299
+ "step": 29400
2300
+ },
2301
+ {
2302
+ "epoch": 2.151355490328344,
2303
+ "grad_norm": 0.9531508684158325,
2304
+ "learning_rate": 5.726941628113004e-05,
2305
+ "loss": 0.708,
2306
+ "step": 29500
2307
+ },
2308
+ {
2309
+ "epoch": 2.1586479736012105,
2310
+ "grad_norm": 0.9125275611877441,
2311
+ "learning_rate": 5.677724185451324e-05,
2312
+ "loss": 0.7123,
2313
+ "step": 29600
2314
+ },
2315
+ {
2316
+ "epoch": 2.165940456874077,
2317
+ "grad_norm": 0.9363859295845032,
2318
+ "learning_rate": 5.628506742789645e-05,
2319
+ "loss": 0.7146,
2320
+ "step": 29700
2321
+ },
2322
+ {
2323
+ "epoch": 2.1732329401469435,
2324
+ "grad_norm": 0.9164854884147644,
2325
+ "learning_rate": 5.579289300127966e-05,
2326
+ "loss": 0.7121,
2327
+ "step": 29800
2328
+ },
2329
+ {
2330
+ "epoch": 2.18052542341981,
2331
+ "grad_norm": 0.941330075263977,
2332
+ "learning_rate": 5.530071857466287e-05,
2333
+ "loss": 0.7086,
2334
+ "step": 29900
2335
+ },
2336
+ {
2337
+ "epoch": 2.1878179066926764,
2338
+ "grad_norm": 0.9006567597389221,
2339
+ "learning_rate": 5.480854414804607e-05,
2340
+ "loss": 0.7097,
2341
+ "step": 30000
2342
+ },
2343
+ {
2344
+ "epoch": 2.1878179066926764,
2345
+ "eval_loss": 0.7143043875694275,
2346
+ "eval_runtime": 61.0555,
2347
+ "eval_samples_per_second": 146.67,
2348
+ "eval_steps_per_second": 18.344,
2349
+ "step": 30000
2350
+ },
2351
+ {
2352
+ "epoch": 2.195110389965543,
2353
+ "grad_norm": 0.8913944363594055,
2354
+ "learning_rate": 5.431636972142927e-05,
2355
+ "loss": 0.7066,
2356
+ "step": 30100
2357
+ },
2358
+ {
2359
+ "epoch": 2.2024028732384093,
2360
+ "grad_norm": 0.9200546145439148,
2361
+ "learning_rate": 5.3824195294812486e-05,
2362
+ "loss": 0.7076,
2363
+ "step": 30200
2364
+ },
2365
+ {
2366
+ "epoch": 2.209695356511276,
2367
+ "grad_norm": 0.924148440361023,
2368
+ "learning_rate": 5.3332020868195684e-05,
2369
+ "loss": 0.7058,
2370
+ "step": 30300
2371
+ },
2372
+ {
2373
+ "epoch": 2.2169878397841423,
2374
+ "grad_norm": 0.922255277633667,
2375
+ "learning_rate": 5.2839846441578897e-05,
2376
+ "loss": 0.7108,
2377
+ "step": 30400
2378
+ },
2379
+ {
2380
+ "epoch": 2.224280323057009,
2381
+ "grad_norm": 0.9039818644523621,
2382
+ "learning_rate": 5.23476720149621e-05,
2383
+ "loss": 0.7091,
2384
+ "step": 30500
2385
+ },
2386
+ {
2387
+ "epoch": 2.2315728063298756,
2388
+ "grad_norm": 0.963845431804657,
2389
+ "learning_rate": 5.1855497588345314e-05,
2390
+ "loss": 0.7065,
2391
+ "step": 30600
2392
+ },
2393
+ {
2394
+ "epoch": 2.238865289602742,
2395
+ "grad_norm": 0.8838880658149719,
2396
+ "learning_rate": 5.136332316172851e-05,
2397
+ "loss": 0.7113,
2398
+ "step": 30700
2399
+ },
2400
+ {
2401
+ "epoch": 2.2461577728756086,
2402
+ "grad_norm": 0.9642555117607117,
2403
+ "learning_rate": 5.0871148735111725e-05,
2404
+ "loss": 0.7062,
2405
+ "step": 30800
2406
+ },
2407
+ {
2408
+ "epoch": 2.253450256148475,
2409
+ "grad_norm": 0.9088276624679565,
2410
+ "learning_rate": 5.037897430849493e-05,
2411
+ "loss": 0.7071,
2412
+ "step": 30900
2413
+ },
2414
+ {
2415
+ "epoch": 2.2607427394213415,
2416
+ "grad_norm": 0.9083282351493835,
2417
+ "learning_rate": 4.9886799881878137e-05,
2418
+ "loss": 0.7126,
2419
+ "step": 31000
2420
+ },
2421
+ {
2422
+ "epoch": 2.2607427394213415,
2423
+ "eval_loss": 0.7129958868026733,
2424
+ "eval_runtime": 60.7821,
2425
+ "eval_samples_per_second": 147.33,
2426
+ "eval_steps_per_second": 18.426,
2427
+ "step": 31000
2428
+ },
2429
+ {
2430
+ "epoch": 2.2680352226942078,
2431
+ "grad_norm": 0.886710524559021,
2432
+ "learning_rate": 4.939462545526134e-05,
2433
+ "loss": 0.7043,
2434
+ "step": 31100
2435
+ },
2436
+ {
2437
+ "epoch": 2.2753277059670745,
2438
+ "grad_norm": 0.8600069880485535,
2439
+ "learning_rate": 4.8902451028644554e-05,
2440
+ "loss": 0.7074,
2441
+ "step": 31200
2442
+ },
2443
+ {
2444
+ "epoch": 2.2826201892399407,
2445
+ "grad_norm": 0.8897703289985657,
2446
+ "learning_rate": 4.841027660202776e-05,
2447
+ "loss": 0.7068,
2448
+ "step": 31300
2449
+ },
2450
+ {
2451
+ "epoch": 2.2899126725128074,
2452
+ "grad_norm": 0.8638718724250793,
2453
+ "learning_rate": 4.7918102175410965e-05,
2454
+ "loss": 0.7062,
2455
+ "step": 31400
2456
+ },
2457
+ {
2458
+ "epoch": 2.297205155785674,
2459
+ "grad_norm": 0.8973529934883118,
2460
+ "learning_rate": 4.742592774879418e-05,
2461
+ "loss": 0.7073,
2462
+ "step": 31500
2463
+ },
2464
+ {
2465
+ "epoch": 2.3044976390585403,
2466
+ "grad_norm": 0.9759765267372131,
2467
+ "learning_rate": 4.693375332217738e-05,
2468
+ "loss": 0.7087,
2469
+ "step": 31600
2470
+ },
2471
+ {
2472
+ "epoch": 2.311790122331407,
2473
+ "grad_norm": 0.9061428904533386,
2474
+ "learning_rate": 4.644157889556059e-05,
2475
+ "loss": 0.708,
2476
+ "step": 31700
2477
+ },
2478
+ {
2479
+ "epoch": 2.3190826056042733,
2480
+ "grad_norm": 0.8808257579803467,
2481
+ "learning_rate": 4.5949404468943794e-05,
2482
+ "loss": 0.7086,
2483
+ "step": 31800
2484
+ },
2485
+ {
2486
+ "epoch": 2.32637508887714,
2487
+ "grad_norm": 0.9116071462631226,
2488
+ "learning_rate": 4.545723004232701e-05,
2489
+ "loss": 0.7118,
2490
+ "step": 31900
2491
+ },
2492
+ {
2493
+ "epoch": 2.333667572150006,
2494
+ "grad_norm": 0.9131873846054077,
2495
+ "learning_rate": 4.496505561571021e-05,
2496
+ "loss": 0.7043,
2497
+ "step": 32000
2498
+ },
2499
+ {
2500
+ "epoch": 2.333667572150006,
2501
+ "eval_loss": 0.7112506031990051,
2502
+ "eval_runtime": 61.1535,
2503
+ "eval_samples_per_second": 146.435,
2504
+ "eval_steps_per_second": 18.315,
2505
+ "step": 32000
2506
+ },
2507
+ {
2508
+ "epoch": 2.340960055422873,
2509
+ "grad_norm": 0.9860331416130066,
2510
+ "learning_rate": 4.447288118909342e-05,
2511
+ "loss": 0.7063,
2512
+ "step": 32100
2513
+ },
2514
+ {
2515
+ "epoch": 2.348252538695739,
2516
+ "grad_norm": 0.933958888053894,
2517
+ "learning_rate": 4.398070676247662e-05,
2518
+ "loss": 0.708,
2519
+ "step": 32200
2520
+ },
2521
+ {
2522
+ "epoch": 2.355545021968606,
2523
+ "grad_norm": 0.8994225859642029,
2524
+ "learning_rate": 4.3488532335859836e-05,
2525
+ "loss": 0.7089,
2526
+ "step": 32300
2527
+ },
2528
+ {
2529
+ "epoch": 2.3628375052414725,
2530
+ "grad_norm": 0.9435915946960449,
2531
+ "learning_rate": 4.299635790924304e-05,
2532
+ "loss": 0.7057,
2533
+ "step": 32400
2534
+ },
2535
+ {
2536
+ "epoch": 2.3701299885143388,
2537
+ "grad_norm": 0.888438880443573,
2538
+ "learning_rate": 4.2504183482626247e-05,
2539
+ "loss": 0.7012,
2540
+ "step": 32500
2541
+ },
2542
+ {
2543
+ "epoch": 2.3774224717872054,
2544
+ "grad_norm": 0.8772885799407959,
2545
+ "learning_rate": 4.201200905600945e-05,
2546
+ "loss": 0.7071,
2547
+ "step": 32600
2548
+ },
2549
+ {
2550
+ "epoch": 2.3847149550600717,
2551
+ "grad_norm": 0.9333481788635254,
2552
+ "learning_rate": 4.151983462939266e-05,
2553
+ "loss": 0.7095,
2554
+ "step": 32700
2555
+ },
2556
+ {
2557
+ "epoch": 2.3920074383329384,
2558
+ "grad_norm": 0.9497707486152649,
2559
+ "learning_rate": 4.102766020277586e-05,
2560
+ "loss": 0.7115,
2561
+ "step": 32800
2562
+ },
2563
+ {
2564
+ "epoch": 2.3992999216058046,
2565
+ "grad_norm": 0.9641472697257996,
2566
+ "learning_rate": 4.053548577615907e-05,
2567
+ "loss": 0.712,
2568
+ "step": 32900
2569
+ },
2570
+ {
2571
+ "epoch": 2.4065924048786713,
2572
+ "grad_norm": 0.8958153128623962,
2573
+ "learning_rate": 4.004331134954228e-05,
2574
+ "loss": 0.7035,
2575
+ "step": 33000
2576
+ },
2577
+ {
2578
+ "epoch": 2.4065924048786713,
2579
+ "eval_loss": 0.7100856304168701,
2580
+ "eval_runtime": 61.2325,
2581
+ "eval_samples_per_second": 146.246,
2582
+ "eval_steps_per_second": 18.291,
2583
+ "step": 33000
2584
+ },
2585
+ {
2586
+ "epoch": 2.4138848881515376,
2587
+ "grad_norm": 0.8818393349647522,
2588
+ "learning_rate": 3.9551136922925487e-05,
2589
+ "loss": 0.7052,
2590
+ "step": 33100
2591
+ },
2592
+ {
2593
+ "epoch": 2.4211773714244043,
2594
+ "grad_norm": 0.8973012566566467,
2595
+ "learning_rate": 3.905896249630869e-05,
2596
+ "loss": 0.706,
2597
+ "step": 33200
2598
+ },
2599
+ {
2600
+ "epoch": 2.428469854697271,
2601
+ "grad_norm": 0.8582873344421387,
2602
+ "learning_rate": 3.85667880696919e-05,
2603
+ "loss": 0.7088,
2604
+ "step": 33300
2605
+ },
2606
+ {
2607
+ "epoch": 2.435762337970137,
2608
+ "grad_norm": 0.9306252002716064,
2609
+ "learning_rate": 3.807461364307511e-05,
2610
+ "loss": 0.7062,
2611
+ "step": 33400
2612
+ },
2613
+ {
2614
+ "epoch": 2.443054821243004,
2615
+ "grad_norm": 0.8586992025375366,
2616
+ "learning_rate": 3.7582439216458315e-05,
2617
+ "loss": 0.7086,
2618
+ "step": 33500
2619
+ },
2620
+ {
2621
+ "epoch": 2.45034730451587,
2622
+ "grad_norm": 0.9076369404792786,
2623
+ "learning_rate": 3.709026478984152e-05,
2624
+ "loss": 0.7052,
2625
+ "step": 33600
2626
+ },
2627
+ {
2628
+ "epoch": 2.457639787788737,
2629
+ "grad_norm": 0.8954334855079651,
2630
+ "learning_rate": 3.6598090363224727e-05,
2631
+ "loss": 0.7082,
2632
+ "step": 33700
2633
+ },
2634
+ {
2635
+ "epoch": 2.464932271061603,
2636
+ "grad_norm": 0.9315345287322998,
2637
+ "learning_rate": 3.610591593660794e-05,
2638
+ "loss": 0.7058,
2639
+ "step": 33800
2640
+ },
2641
+ {
2642
+ "epoch": 2.4722247543344698,
2643
+ "grad_norm": 0.9223620295524597,
2644
+ "learning_rate": 3.5613741509991144e-05,
2645
+ "loss": 0.6992,
2646
+ "step": 33900
2647
+ },
2648
+ {
2649
+ "epoch": 2.479517237607336,
2650
+ "grad_norm": 0.9349290132522583,
2651
+ "learning_rate": 3.512156708337435e-05,
2652
+ "loss": 0.7084,
2653
+ "step": 34000
2654
+ },
2655
+ {
2656
+ "epoch": 2.479517237607336,
2657
+ "eval_loss": 0.7087690234184265,
2658
+ "eval_runtime": 60.8859,
2659
+ "eval_samples_per_second": 147.078,
2660
+ "eval_steps_per_second": 18.395,
2661
+ "step": 34000
2662
+ },
2663
+ {
2664
+ "epoch": 2.4868097208802027,
2665
+ "grad_norm": 0.883210301399231,
2666
+ "learning_rate": 3.462939265675756e-05,
2667
+ "loss": 0.7061,
2668
+ "step": 34100
2669
+ },
2670
+ {
2671
+ "epoch": 2.4941022041530694,
2672
+ "grad_norm": 0.920868456363678,
2673
+ "learning_rate": 3.413721823014077e-05,
2674
+ "loss": 0.7069,
2675
+ "step": 34200
2676
+ },
2677
+ {
2678
+ "epoch": 2.5013946874259356,
2679
+ "grad_norm": 0.9177393913269043,
2680
+ "learning_rate": 3.3645043803523966e-05,
2681
+ "loss": 0.7071,
2682
+ "step": 34300
2683
+ },
2684
+ {
2685
+ "epoch": 2.5086871706988023,
2686
+ "grad_norm": 0.9114101529121399,
2687
+ "learning_rate": 3.315286937690717e-05,
2688
+ "loss": 0.7072,
2689
+ "step": 34400
2690
+ },
2691
+ {
2692
+ "epoch": 2.5159796539716686,
2693
+ "grad_norm": 0.9645174145698547,
2694
+ "learning_rate": 3.2660694950290384e-05,
2695
+ "loss": 0.7028,
2696
+ "step": 34500
2697
+ },
2698
+ {
2699
+ "epoch": 2.5232721372445353,
2700
+ "grad_norm": 0.8982295989990234,
2701
+ "learning_rate": 3.216852052367359e-05,
2702
+ "loss": 0.7085,
2703
+ "step": 34600
2704
+ },
2705
+ {
2706
+ "epoch": 2.530564620517402,
2707
+ "grad_norm": 0.8964338898658752,
2708
+ "learning_rate": 3.1676346097056795e-05,
2709
+ "loss": 0.7069,
2710
+ "step": 34700
2711
+ },
2712
+ {
2713
+ "epoch": 2.537857103790268,
2714
+ "grad_norm": 0.9609666466712952,
2715
+ "learning_rate": 3.118417167044001e-05,
2716
+ "loss": 0.7057,
2717
+ "step": 34800
2718
+ },
2719
+ {
2720
+ "epoch": 2.5451495870631344,
2721
+ "grad_norm": 0.9131038188934326,
2722
+ "learning_rate": 3.069199724382321e-05,
2723
+ "loss": 0.7031,
2724
+ "step": 34900
2725
+ },
2726
+ {
2727
+ "epoch": 2.552442070336001,
2728
+ "grad_norm": 0.9127321839332581,
2729
+ "learning_rate": 3.019982281720642e-05,
2730
+ "loss": 0.6979,
2731
+ "step": 35000
2732
+ },
2733
+ {
2734
+ "epoch": 2.552442070336001,
2735
+ "eval_loss": 0.7076790928840637,
2736
+ "eval_runtime": 61.0966,
2737
+ "eval_samples_per_second": 146.571,
2738
+ "eval_steps_per_second": 18.332,
2739
+ "step": 35000
2740
+ },
2741
+ {
2742
+ "epoch": 2.559734553608868,
2743
+ "grad_norm": 0.9567495584487915,
2744
+ "learning_rate": 2.9707648390589628e-05,
2745
+ "loss": 0.7053,
2746
+ "step": 35100
2747
+ },
2748
+ {
2749
+ "epoch": 2.567027036881734,
2750
+ "grad_norm": 0.9740573763847351,
2751
+ "learning_rate": 2.9215473963972833e-05,
2752
+ "loss": 0.7077,
2753
+ "step": 35200
2754
+ },
2755
+ {
2756
+ "epoch": 2.5743195201546007,
2757
+ "grad_norm": 0.8982974886894226,
2758
+ "learning_rate": 2.8723299537356042e-05,
2759
+ "loss": 0.6983,
2760
+ "step": 35300
2761
+ },
2762
+ {
2763
+ "epoch": 2.581612003427467,
2764
+ "grad_norm": 1.0185188055038452,
2765
+ "learning_rate": 2.8231125110739248e-05,
2766
+ "loss": 0.7069,
2767
+ "step": 35400
2768
+ },
2769
+ {
2770
+ "epoch": 2.5889044867003337,
2771
+ "grad_norm": 0.94049471616745,
2772
+ "learning_rate": 2.7738950684122457e-05,
2773
+ "loss": 0.7054,
2774
+ "step": 35500
2775
+ },
2776
+ {
2777
+ "epoch": 2.5961969699732004,
2778
+ "grad_norm": 0.8923749923706055,
2779
+ "learning_rate": 2.7246776257505662e-05,
2780
+ "loss": 0.7015,
2781
+ "step": 35600
2782
+ },
2783
+ {
2784
+ "epoch": 2.6034894532460666,
2785
+ "grad_norm": 0.9568887948989868,
2786
+ "learning_rate": 2.675460183088887e-05,
2787
+ "loss": 0.7025,
2788
+ "step": 35700
2789
+ },
2790
+ {
2791
+ "epoch": 2.610781936518933,
2792
+ "grad_norm": 0.9106321334838867,
2793
+ "learning_rate": 2.6262427404272077e-05,
2794
+ "loss": 0.7049,
2795
+ "step": 35800
2796
+ },
2797
+ {
2798
+ "epoch": 2.6180744197917996,
2799
+ "grad_norm": 0.9499268531799316,
2800
+ "learning_rate": 2.5770252977655285e-05,
2801
+ "loss": 0.7021,
2802
+ "step": 35900
2803
+ },
2804
+ {
2805
+ "epoch": 2.6253669030646662,
2806
+ "grad_norm": 0.8965421915054321,
2807
+ "learning_rate": 2.5278078551038488e-05,
2808
+ "loss": 0.7036,
2809
+ "step": 36000
2810
+ },
2811
+ {
2812
+ "epoch": 2.6253669030646662,
2813
+ "eval_loss": 0.7065343856811523,
2814
+ "eval_runtime": 61.0446,
2815
+ "eval_samples_per_second": 146.696,
2816
+ "eval_steps_per_second": 18.347,
2817
+ "step": 36000
2818
+ },
2819
+ {
2820
+ "epoch": 2.6326593863375325,
2821
+ "grad_norm": 0.94576096534729,
2822
+ "learning_rate": 2.4785904124421696e-05,
2823
+ "loss": 0.71,
2824
+ "step": 36100
2825
+ },
2826
+ {
2827
+ "epoch": 2.639951869610399,
2828
+ "grad_norm": 0.962692141532898,
2829
+ "learning_rate": 2.4293729697804905e-05,
2830
+ "loss": 0.6953,
2831
+ "step": 36200
2832
+ },
2833
+ {
2834
+ "epoch": 2.6472443528832654,
2835
+ "grad_norm": 0.9457094669342041,
2836
+ "learning_rate": 2.380155527118811e-05,
2837
+ "loss": 0.7011,
2838
+ "step": 36300
2839
+ },
2840
+ {
2841
+ "epoch": 2.654536836156132,
2842
+ "grad_norm": 0.9523045420646667,
2843
+ "learning_rate": 2.330938084457132e-05,
2844
+ "loss": 0.7093,
2845
+ "step": 36400
2846
+ },
2847
+ {
2848
+ "epoch": 2.661829319428999,
2849
+ "grad_norm": 0.9255204796791077,
2850
+ "learning_rate": 2.2817206417954522e-05,
2851
+ "loss": 0.6979,
2852
+ "step": 36500
2853
+ },
2854
+ {
2855
+ "epoch": 2.669121802701865,
2856
+ "grad_norm": 1.015286922454834,
2857
+ "learning_rate": 2.232503199133773e-05,
2858
+ "loss": 0.7044,
2859
+ "step": 36600
2860
+ },
2861
+ {
2862
+ "epoch": 2.6764142859747313,
2863
+ "grad_norm": 0.8911315202713013,
2864
+ "learning_rate": 2.1832857564720936e-05,
2865
+ "loss": 0.7031,
2866
+ "step": 36700
2867
+ },
2868
+ {
2869
+ "epoch": 2.683706769247598,
2870
+ "grad_norm": 0.9372689127922058,
2871
+ "learning_rate": 2.1340683138104145e-05,
2872
+ "loss": 0.7019,
2873
+ "step": 36800
2874
+ },
2875
+ {
2876
+ "epoch": 2.6909992525204647,
2877
+ "grad_norm": 0.9245051145553589,
2878
+ "learning_rate": 2.084850871148735e-05,
2879
+ "loss": 0.7065,
2880
+ "step": 36900
2881
+ },
2882
+ {
2883
+ "epoch": 2.698291735793331,
2884
+ "grad_norm": 0.917607843875885,
2885
+ "learning_rate": 2.035633428487056e-05,
2886
+ "loss": 0.7016,
2887
+ "step": 37000
2888
+ },
2889
+ {
2890
+ "epoch": 2.698291735793331,
2891
+ "eval_loss": 0.7054994702339172,
2892
+ "eval_runtime": 60.6541,
2893
+ "eval_samples_per_second": 147.64,
2894
+ "eval_steps_per_second": 18.465,
2895
+ "step": 37000
2896
+ },
2897
+ {
2898
+ "epoch": 2.7055842190661976,
2899
+ "grad_norm": 0.9054610729217529,
2900
+ "learning_rate": 1.9864159858253765e-05,
2901
+ "loss": 0.7034,
2902
+ "step": 37100
2903
+ },
2904
+ {
2905
+ "epoch": 2.712876702339064,
2906
+ "grad_norm": 0.960075855255127,
2907
+ "learning_rate": 1.9371985431636974e-05,
2908
+ "loss": 0.7097,
2909
+ "step": 37200
2910
+ },
2911
+ {
2912
+ "epoch": 2.7201691856119306,
2913
+ "grad_norm": 0.9454420208930969,
2914
+ "learning_rate": 1.887981100502018e-05,
2915
+ "loss": 0.7046,
2916
+ "step": 37300
2917
+ },
2918
+ {
2919
+ "epoch": 2.7274616688847972,
2920
+ "grad_norm": 0.8761453628540039,
2921
+ "learning_rate": 1.8387636578403385e-05,
2922
+ "loss": 0.7068,
2923
+ "step": 37400
2924
+ },
2925
+ {
2926
+ "epoch": 2.7347541521576635,
2927
+ "grad_norm": 0.9231957793235779,
2928
+ "learning_rate": 1.7895462151786594e-05,
2929
+ "loss": 0.6983,
2930
+ "step": 37500
2931
+ },
2932
+ {
2933
+ "epoch": 2.7420466354305297,
2934
+ "grad_norm": 0.8630309104919434,
2935
+ "learning_rate": 1.74032877251698e-05,
2936
+ "loss": 0.6984,
2937
+ "step": 37600
2938
+ },
2939
+ {
2940
+ "epoch": 2.7493391187033964,
2941
+ "grad_norm": 0.9077728986740112,
2942
+ "learning_rate": 1.691111329855301e-05,
2943
+ "loss": 0.7097,
2944
+ "step": 37700
2945
+ },
2946
+ {
2947
+ "epoch": 2.756631601976263,
2948
+ "grad_norm": 0.9849316477775574,
2949
+ "learning_rate": 1.6418938871936214e-05,
2950
+ "loss": 0.7025,
2951
+ "step": 37800
2952
+ },
2953
+ {
2954
+ "epoch": 2.7639240852491294,
2955
+ "grad_norm": 0.9101927280426025,
2956
+ "learning_rate": 1.5926764445319423e-05,
2957
+ "loss": 0.7127,
2958
+ "step": 37900
2959
+ },
2960
+ {
2961
+ "epoch": 2.771216568521996,
2962
+ "grad_norm": 0.9624613523483276,
2963
+ "learning_rate": 1.543459001870263e-05,
2964
+ "loss": 0.7038,
2965
+ "step": 38000
2966
+ },
2967
+ {
2968
+ "epoch": 2.771216568521996,
2969
+ "eval_loss": 0.7042670845985413,
2970
+ "eval_runtime": 60.6288,
2971
+ "eval_samples_per_second": 147.702,
2972
+ "eval_steps_per_second": 18.473,
2973
+ "step": 38000
2974
+ },
2975
+ {
2976
+ "epoch": 2.7785090517948623,
2977
+ "grad_norm": 0.8926946520805359,
2978
+ "learning_rate": 1.4942415592085838e-05,
2979
+ "loss": 0.6955,
2980
+ "step": 38100
2981
+ },
2982
+ {
2983
+ "epoch": 2.785801535067729,
2984
+ "grad_norm": 0.9353916645050049,
2985
+ "learning_rate": 1.4450241165469041e-05,
2986
+ "loss": 0.7003,
2987
+ "step": 38200
2988
+ },
2989
+ {
2990
+ "epoch": 2.7930940183405957,
2991
+ "grad_norm": 0.9394625425338745,
2992
+ "learning_rate": 1.3958066738852249e-05,
2993
+ "loss": 0.6963,
2994
+ "step": 38300
2995
+ },
2996
+ {
2997
+ "epoch": 2.800386501613462,
2998
+ "grad_norm": 0.8811284303665161,
2999
+ "learning_rate": 1.3465892312235456e-05,
3000
+ "loss": 0.7057,
3001
+ "step": 38400
3002
+ },
3003
+ {
3004
+ "epoch": 2.807678984886328,
3005
+ "grad_norm": 0.9111167788505554,
3006
+ "learning_rate": 1.2973717885618663e-05,
3007
+ "loss": 0.6905,
3008
+ "step": 38500
3009
+ },
3010
+ {
3011
+ "epoch": 2.814971468159195,
3012
+ "grad_norm": 0.9061198830604553,
3013
+ "learning_rate": 1.248154345900187e-05,
3014
+ "loss": 0.6966,
3015
+ "step": 38600
3016
+ },
3017
+ {
3018
+ "epoch": 2.8222639514320615,
3019
+ "grad_norm": 0.917921781539917,
3020
+ "learning_rate": 1.1989369032385078e-05,
3021
+ "loss": 0.7055,
3022
+ "step": 38700
3023
+ },
3024
+ {
3025
+ "epoch": 2.829556434704928,
3026
+ "grad_norm": 0.9210913777351379,
3027
+ "learning_rate": 1.1497194605768285e-05,
3028
+ "loss": 0.7004,
3029
+ "step": 38800
3030
+ },
3031
+ {
3032
+ "epoch": 2.8368489179777945,
3033
+ "grad_norm": 0.9152899384498596,
3034
+ "learning_rate": 1.1005020179151492e-05,
3035
+ "loss": 0.7065,
3036
+ "step": 38900
3037
+ },
3038
+ {
3039
+ "epoch": 2.8441414012506607,
3040
+ "grad_norm": 0.9237668514251709,
3041
+ "learning_rate": 1.05128457525347e-05,
3042
+ "loss": 0.7027,
3043
+ "step": 39000
3044
+ },
3045
+ {
3046
+ "epoch": 2.8441414012506607,
3047
+ "eval_loss": 0.7034493088722229,
3048
+ "eval_runtime": 60.6775,
3049
+ "eval_samples_per_second": 147.583,
3050
+ "eval_steps_per_second": 18.458,
3051
+ "step": 39000
3052
+ },
3053
+ {
3054
+ "epoch": 2.8514338845235274,
3055
+ "grad_norm": 0.9577778577804565,
3056
+ "learning_rate": 1.0020671325917906e-05,
3057
+ "loss": 0.7064,
3058
+ "step": 39100
3059
+ },
3060
+ {
3061
+ "epoch": 2.858726367796394,
3062
+ "grad_norm": 0.9955913424491882,
3063
+ "learning_rate": 9.528496899301114e-06,
3064
+ "loss": 0.7017,
3065
+ "step": 39200
3066
+ },
3067
+ {
3068
+ "epoch": 2.8660188510692604,
3069
+ "grad_norm": 0.9187660217285156,
3070
+ "learning_rate": 9.03632247268432e-06,
3071
+ "loss": 0.6998,
3072
+ "step": 39300
3073
+ },
3074
+ {
3075
+ "epoch": 2.8733113343421266,
3076
+ "grad_norm": 0.9275550842285156,
3077
+ "learning_rate": 8.544148046067526e-06,
3078
+ "loss": 0.7002,
3079
+ "step": 39400
3080
+ },
3081
+ {
3082
+ "epoch": 2.8806038176149933,
3083
+ "grad_norm": 0.9114721417427063,
3084
+ "learning_rate": 8.051973619450734e-06,
3085
+ "loss": 0.7027,
3086
+ "step": 39500
3087
+ },
3088
+ {
3089
+ "epoch": 2.88789630088786,
3090
+ "grad_norm": 0.9408327341079712,
3091
+ "learning_rate": 7.559799192833941e-06,
3092
+ "loss": 0.7034,
3093
+ "step": 39600
3094
+ },
3095
+ {
3096
+ "epoch": 2.8951887841607262,
3097
+ "grad_norm": 0.9538366198539734,
3098
+ "learning_rate": 7.067624766217147e-06,
3099
+ "loss": 0.7007,
3100
+ "step": 39700
3101
+ },
3102
+ {
3103
+ "epoch": 2.902481267433593,
3104
+ "grad_norm": 0.923864483833313,
3105
+ "learning_rate": 6.5754503396003544e-06,
3106
+ "loss": 0.6972,
3107
+ "step": 39800
3108
+ },
3109
+ {
3110
+ "epoch": 2.909773750706459,
3111
+ "grad_norm": 0.9156636595726013,
3112
+ "learning_rate": 6.083275912983562e-06,
3113
+ "loss": 0.7064,
3114
+ "step": 39900
3115
+ },
3116
+ {
3117
+ "epoch": 2.917066233979326,
3118
+ "grad_norm": 0.9568312168121338,
3119
+ "learning_rate": 5.591101486366768e-06,
3120
+ "loss": 0.6969,
3121
+ "step": 40000
3122
+ },
3123
+ {
3124
+ "epoch": 2.917066233979326,
3125
+ "eval_loss": 0.7027888894081116,
3126
+ "eval_runtime": 61.1155,
3127
+ "eval_samples_per_second": 146.526,
3128
+ "eval_steps_per_second": 18.326,
3129
+ "step": 40000
3130
+ },
3131
+ {
3132
+ "epoch": 2.9243587172521925,
3133
+ "grad_norm": 0.9376012086868286,
3134
+ "learning_rate": 5.098927059749975e-06,
3135
+ "loss": 0.7,
3136
+ "step": 40100
3137
+ },
3138
+ {
3139
+ "epoch": 2.931651200525059,
3140
+ "grad_norm": 0.9648913145065308,
3141
+ "learning_rate": 4.6067526331331825e-06,
3142
+ "loss": 0.7042,
3143
+ "step": 40200
3144
+ },
3145
+ {
3146
+ "epoch": 2.938943683797925,
3147
+ "grad_norm": 0.9452090263366699,
3148
+ "learning_rate": 4.11457820651639e-06,
3149
+ "loss": 0.7041,
3150
+ "step": 40300
3151
+ },
3152
+ {
3153
+ "epoch": 2.9462361670707917,
3154
+ "grad_norm": 0.9553784728050232,
3155
+ "learning_rate": 3.622403779899597e-06,
3156
+ "loss": 0.7005,
3157
+ "step": 40400
3158
+ },
3159
+ {
3160
+ "epoch": 2.9535286503436584,
3161
+ "grad_norm": 0.8788447380065918,
3162
+ "learning_rate": 3.1302293532828033e-06,
3163
+ "loss": 0.6974,
3164
+ "step": 40500
3165
+ },
3166
+ {
3167
+ "epoch": 2.9608211336165247,
3168
+ "grad_norm": 0.9146846532821655,
3169
+ "learning_rate": 2.6380549266660105e-06,
3170
+ "loss": 0.7004,
3171
+ "step": 40600
3172
+ },
3173
+ {
3174
+ "epoch": 2.9681136168893913,
3175
+ "grad_norm": 0.9674293398857117,
3176
+ "learning_rate": 2.1458805000492173e-06,
3177
+ "loss": 0.7028,
3178
+ "step": 40700
3179
+ },
3180
+ {
3181
+ "epoch": 2.9754061001622576,
3182
+ "grad_norm": 0.9374125599861145,
3183
+ "learning_rate": 1.6537060734324243e-06,
3184
+ "loss": 0.7008,
3185
+ "step": 40800
3186
+ },
3187
+ {
3188
+ "epoch": 2.9826985834351243,
3189
+ "grad_norm": 0.9554013013839722,
3190
+ "learning_rate": 1.1615316468156316e-06,
3191
+ "loss": 0.7011,
3192
+ "step": 40900
3193
+ },
3194
+ {
3195
+ "epoch": 2.989991066707991,
3196
+ "grad_norm": 0.8910831212997437,
3197
+ "learning_rate": 6.693572201988385e-07,
3198
+ "loss": 0.6992,
3199
+ "step": 41000
3200
+ },
3201
+ {
3202
+ "epoch": 2.989991066707991,
3203
+ "eval_loss": 0.7023043632507324,
3204
+ "eval_runtime": 61.2519,
3205
+ "eval_samples_per_second": 146.2,
3206
+ "eval_steps_per_second": 18.285,
3207
+ "step": 41000
3208
+ },
3209
+ {
3210
+ "epoch": 2.997283549980857,
3211
+ "grad_norm": 0.9466680288314819,
3212
+ "learning_rate": 1.771827935820455e-07,
3213
+ "loss": 0.6961,
3214
+ "step": 41100
3215
+ }
3216
+ ],
3217
+ "logging_steps": 100,
3218
+ "max_steps": 41136,
3219
+ "num_input_tokens_seen": 0,
3220
+ "num_train_epochs": 3,
3221
+ "save_steps": 1000,
3222
+ "stateful_callbacks": {
3223
+ "TrainerControl": {
3224
+ "args": {
3225
+ "should_epoch_stop": false,
3226
+ "should_evaluate": false,
3227
+ "should_log": false,
3228
+ "should_save": true,
3229
+ "should_training_stop": true
3230
+ },
3231
+ "attributes": {}
3232
+ }
3233
+ },
3234
+ "total_flos": 7.280129344536576e+17,
3235
+ "train_batch_size": 8,
3236
+ "trial_name": null,
3237
+ "trial_params": null
3238
+ }
checkpoint-41136/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffa18fa243cccfbf729510f7d83fcb184f78dfbd7718a3073ec148d996a46094
3
+ size 5713
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": "<|im_end|>"
14
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:047cc1002ac6795c1352776b646cdcd785be6cba5fd35ccec8909d0672eae7e5
3
+ size 11418541
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": null,
34
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
35
+ "clean_up_tokenization_spaces": false,
36
+ "eos_token": "<|im_end|>",
37
+ "errors": "replace",
38
+ "model_max_length": 32768,
39
+ "pad_token": "<|im_end|>",
40
+ "split_special_tokens": false,
41
+ "tokenizer_class": "Qwen2Tokenizer",
42
+ "unk_token": null
43
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffa18fa243cccfbf729510f7d83fcb184f78dfbd7718a3073ec148d996a46094
3
+ size 5713