irodkin commited on
Commit
39fc809
·
verified ·
1 Parent(s): cfe29a2

Training checkpoint at step 3500

Browse files
Files changed (1) hide show
  1. trainer_state.json +1146 -66
trainer_state.json CHANGED
@@ -1,193 +1,1273 @@
1
  {
2
- "best_global_step": 500,
3
- "best_metric": 2.826472282409668,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-500",
5
- "epoch": 0.01,
6
  "eval_steps": 100,
7
- "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0005,
14
- "grad_norm": 82.5841699095815,
15
  "learning_rate": 4.8e-08,
16
- "loss": 3.4393,
17
  "step": 25
18
  },
19
  {
20
  "epoch": 0.001,
21
- "grad_norm": 72.33553691687935,
22
  "learning_rate": 9.8e-08,
23
- "loss": 3.401,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.0015,
28
- "grad_norm": 55.00272386424627,
29
  "learning_rate": 1.4800000000000003e-07,
30
- "loss": 3.3077,
31
  "step": 75
32
  },
33
  {
34
  "epoch": 0.002,
35
- "grad_norm": 24.67154822301572,
36
  "learning_rate": 1.9800000000000003e-07,
37
- "loss": 3.1946,
38
  "step": 100
39
  },
40
  {
41
  "epoch": 0.002,
42
- "eval_loss": 3.11328125,
43
- "eval_runtime": 39.4175,
44
- "eval_samples_per_second": 2.638,
45
- "eval_steps_per_second": 1.319,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.0025,
50
- "grad_norm": 8.681721490029314,
51
  "learning_rate": 2.48e-07,
52
- "loss": 3.0709,
53
  "step": 125
54
  },
55
  {
56
  "epoch": 0.003,
57
- "grad_norm": 7.238581078870377,
58
  "learning_rate": 2.9800000000000005e-07,
59
- "loss": 3.0046,
60
  "step": 150
61
  },
62
  {
63
  "epoch": 0.0035,
64
- "grad_norm": 5.931774986901269,
65
  "learning_rate": 3.48e-07,
66
- "loss": 2.954,
67
  "step": 175
68
  },
69
  {
70
  "epoch": 0.004,
71
- "grad_norm": 4.891205112515998,
72
  "learning_rate": 3.9800000000000004e-07,
73
- "loss": 2.9365,
74
  "step": 200
75
  },
76
  {
77
  "epoch": 0.004,
78
- "eval_loss": 2.922476053237915,
79
- "eval_runtime": 39.6232,
80
- "eval_samples_per_second": 2.625,
81
- "eval_steps_per_second": 1.312,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.0045,
86
- "grad_norm": 4.53174674502475,
87
  "learning_rate": 4.4800000000000004e-07,
88
- "loss": 2.9198,
89
  "step": 225
90
  },
91
  {
92
  "epoch": 0.005,
93
- "grad_norm": 4.642185238219915,
94
  "learning_rate": 4.98e-07,
95
- "loss": 2.9004,
96
  "step": 250
97
  },
98
  {
99
  "epoch": 0.0055,
100
- "grad_norm": 5.838003634607987,
101
  "learning_rate": 5.480000000000001e-07,
102
- "loss": 2.8935,
103
  "step": 275
104
  },
105
  {
106
  "epoch": 0.006,
107
- "grad_norm": 4.535236579882751,
108
  "learning_rate": 5.98e-07,
109
- "loss": 2.8857,
110
  "step": 300
111
  },
112
  {
113
  "epoch": 0.006,
114
- "eval_loss": 2.874248743057251,
115
- "eval_runtime": 39.4088,
116
- "eval_samples_per_second": 2.639,
117
- "eval_steps_per_second": 1.32,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.0065,
122
- "grad_norm": 4.767460098400186,
123
  "learning_rate": 6.48e-07,
124
- "loss": 2.8672,
125
  "step": 325
126
  },
127
  {
128
  "epoch": 0.007,
129
- "grad_norm": 6.424432953613615,
130
  "learning_rate": 6.98e-07,
131
- "loss": 2.8663,
132
  "step": 350
133
  },
134
  {
135
  "epoch": 0.0075,
136
- "grad_norm": 4.530337576253928,
137
  "learning_rate": 7.480000000000001e-07,
138
- "loss": 2.8574,
139
  "step": 375
140
  },
141
  {
142
  "epoch": 0.008,
143
- "grad_norm": 5.094043051124328,
144
  "learning_rate": 7.98e-07,
145
- "loss": 2.8534,
146
  "step": 400
147
  },
148
  {
149
  "epoch": 0.008,
150
- "eval_loss": 2.846604585647583,
151
- "eval_runtime": 45.9565,
152
- "eval_samples_per_second": 2.263,
153
- "eval_steps_per_second": 1.132,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 0.0085,
158
- "grad_norm": 4.737628412447718,
159
  "learning_rate": 8.480000000000001e-07,
160
- "loss": 2.8303,
161
  "step": 425
162
  },
163
  {
164
  "epoch": 0.009,
165
- "grad_norm": 4.176119045358587,
166
  "learning_rate": 8.980000000000001e-07,
167
- "loss": 2.8403,
168
  "step": 450
169
  },
170
  {
171
  "epoch": 0.0095,
172
- "grad_norm": 4.0039940059315065,
173
  "learning_rate": 9.480000000000001e-07,
174
- "loss": 2.8339,
175
  "step": 475
176
  },
177
  {
178
  "epoch": 0.01,
179
- "grad_norm": 4.202142407976928,
180
  "learning_rate": 9.98e-07,
181
- "loss": 2.831,
182
  "step": 500
183
  },
184
  {
185
  "epoch": 0.01,
186
- "eval_loss": 2.826472282409668,
187
- "eval_runtime": 45.8283,
188
- "eval_samples_per_second": 2.269,
189
- "eval_steps_per_second": 1.135,
190
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  }
192
  ],
193
  "logging_steps": 25,
@@ -207,7 +1287,7 @@
207
  "attributes": {}
208
  }
209
  },
210
- "total_flos": 1.1221094951246889e+18,
211
  "train_batch_size": 1,
212
  "trial_name": null,
213
  "trial_params": null
 
1
  {
2
+ "best_global_step": 3500,
3
+ "best_metric": 2.644831657409668,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-3500",
5
+ "epoch": 0.07,
6
  "eval_steps": 100,
7
+ "global_step": 3500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0005,
14
+ "grad_norm": 96.04050869121504,
15
  "learning_rate": 4.8e-08,
16
+ "loss": 3.4391,
17
  "step": 25
18
  },
19
  {
20
  "epoch": 0.001,
21
+ "grad_norm": 78.95958818615539,
22
  "learning_rate": 9.8e-08,
23
+ "loss": 3.397,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.0015,
28
+ "grad_norm": 61.45018428703237,
29
  "learning_rate": 1.4800000000000003e-07,
30
+ "loss": 3.297,
31
  "step": 75
32
  },
33
  {
34
  "epoch": 0.002,
35
+ "grad_norm": 22.353651858428393,
36
  "learning_rate": 1.9800000000000003e-07,
37
+ "loss": 3.1733,
38
  "step": 100
39
  },
40
  {
41
  "epoch": 0.002,
42
+ "eval_loss": 3.09375,
43
+ "eval_runtime": 42.6579,
44
+ "eval_samples_per_second": 2.438,
45
+ "eval_steps_per_second": 1.219,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.0025,
50
+ "grad_norm": 9.835689068347888,
51
  "learning_rate": 2.48e-07,
52
+ "loss": 3.0557,
53
  "step": 125
54
  },
55
  {
56
  "epoch": 0.003,
57
+ "grad_norm": 8.293191220823632,
58
  "learning_rate": 2.9800000000000005e-07,
59
+ "loss": 2.9954,
60
  "step": 150
61
  },
62
  {
63
  "epoch": 0.0035,
64
+ "grad_norm": 6.660135091710579,
65
  "learning_rate": 3.48e-07,
66
+ "loss": 2.9504,
67
  "step": 175
68
  },
69
  {
70
  "epoch": 0.004,
71
+ "grad_norm": 13.605532098937575,
72
  "learning_rate": 3.9800000000000004e-07,
73
+ "loss": 2.9363,
74
  "step": 200
75
  },
76
  {
77
  "epoch": 0.004,
78
+ "eval_loss": 2.924128532409668,
79
+ "eval_runtime": 42.5415,
80
+ "eval_samples_per_second": 2.445,
81
+ "eval_steps_per_second": 1.222,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.0045,
86
+ "grad_norm": 7.7985826788732435,
87
  "learning_rate": 4.4800000000000004e-07,
88
+ "loss": 2.9223,
89
  "step": 225
90
  },
91
  {
92
  "epoch": 0.005,
93
+ "grad_norm": 7.257382344220691,
94
  "learning_rate": 4.98e-07,
95
+ "loss": 2.9043,
96
  "step": 250
97
  },
98
  {
99
  "epoch": 0.0055,
100
+ "grad_norm": 9.049674458422025,
101
  "learning_rate": 5.480000000000001e-07,
102
+ "loss": 2.8984,
103
  "step": 275
104
  },
105
  {
106
  "epoch": 0.006,
107
+ "grad_norm": 5.766079229639856,
108
  "learning_rate": 5.98e-07,
109
+ "loss": 2.8898,
110
  "step": 300
111
  },
112
  {
113
  "epoch": 0.006,
114
+ "eval_loss": 2.877253532409668,
115
+ "eval_runtime": 42.642,
116
+ "eval_samples_per_second": 2.439,
117
+ "eval_steps_per_second": 1.219,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.0065,
122
+ "grad_norm": 5.448754520618337,
123
  "learning_rate": 6.48e-07,
124
+ "loss": 2.871,
125
  "step": 325
126
  },
127
  {
128
  "epoch": 0.007,
129
+ "grad_norm": 6.866471472157179,
130
  "learning_rate": 6.98e-07,
131
+ "loss": 2.8693,
132
  "step": 350
133
  },
134
  {
135
  "epoch": 0.0075,
136
+ "grad_norm": 6.115788528016365,
137
  "learning_rate": 7.480000000000001e-07,
138
+ "loss": 2.8601,
139
  "step": 375
140
  },
141
  {
142
  "epoch": 0.008,
143
+ "grad_norm": 5.871468919197367,
144
  "learning_rate": 7.98e-07,
145
+ "loss": 2.8555,
146
  "step": 400
147
  },
148
  {
149
  "epoch": 0.008,
150
+ "eval_loss": 2.848106861114502,
151
+ "eval_runtime": 42.3632,
152
+ "eval_samples_per_second": 2.455,
153
+ "eval_steps_per_second": 1.227,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 0.0085,
158
+ "grad_norm": 6.050804087803095,
159
  "learning_rate": 8.480000000000001e-07,
160
+ "loss": 2.832,
161
  "step": 425
162
  },
163
  {
164
  "epoch": 0.009,
165
+ "grad_norm": 4.634127162302958,
166
  "learning_rate": 8.980000000000001e-07,
167
+ "loss": 2.8418,
168
  "step": 450
169
  },
170
  {
171
  "epoch": 0.0095,
172
+ "grad_norm": 5.700549652048682,
173
  "learning_rate": 9.480000000000001e-07,
174
+ "loss": 2.8351,
175
  "step": 475
176
  },
177
  {
178
  "epoch": 0.01,
179
+ "grad_norm": 5.462019159507559,
180
  "learning_rate": 9.98e-07,
181
+ "loss": 2.8319,
182
  "step": 500
183
  },
184
  {
185
  "epoch": 0.01,
186
+ "eval_loss": 2.828125,
187
+ "eval_runtime": 42.4078,
188
+ "eval_samples_per_second": 2.452,
189
+ "eval_steps_per_second": 1.226,
190
  "step": 500
191
+ },
192
+ {
193
+ "epoch": 0.0105,
194
+ "grad_norm": 5.100237356575638,
195
+ "learning_rate": 1.0480000000000002e-06,
196
+ "loss": 2.8368,
197
+ "step": 525
198
+ },
199
+ {
200
+ "epoch": 0.011,
201
+ "grad_norm": 5.8591675831655134,
202
+ "learning_rate": 1.0980000000000001e-06,
203
+ "loss": 2.8262,
204
+ "step": 550
205
+ },
206
+ {
207
+ "epoch": 0.0115,
208
+ "grad_norm": 4.582188259829454,
209
+ "learning_rate": 1.148e-06,
210
+ "loss": 2.8083,
211
+ "step": 575
212
+ },
213
+ {
214
+ "epoch": 0.012,
215
+ "grad_norm": 4.853482247652135,
216
+ "learning_rate": 1.1980000000000002e-06,
217
+ "loss": 2.8187,
218
+ "step": 600
219
+ },
220
+ {
221
+ "epoch": 0.012,
222
+ "eval_loss": 2.810246467590332,
223
+ "eval_runtime": 42.429,
224
+ "eval_samples_per_second": 2.451,
225
+ "eval_steps_per_second": 1.226,
226
+ "step": 600
227
+ },
228
+ {
229
+ "epoch": 0.0125,
230
+ "grad_norm": 4.813324366644894,
231
+ "learning_rate": 1.248e-06,
232
+ "loss": 2.8109,
233
+ "step": 625
234
+ },
235
+ {
236
+ "epoch": 0.013,
237
+ "grad_norm": 4.680021008982155,
238
+ "learning_rate": 1.2980000000000001e-06,
239
+ "loss": 2.8071,
240
+ "step": 650
241
+ },
242
+ {
243
+ "epoch": 0.0135,
244
+ "grad_norm": 4.232572917961915,
245
+ "learning_rate": 1.348e-06,
246
+ "loss": 2.7996,
247
+ "step": 675
248
+ },
249
+ {
250
+ "epoch": 0.014,
251
+ "grad_norm": 4.140300235345937,
252
+ "learning_rate": 1.3980000000000002e-06,
253
+ "loss": 2.7965,
254
+ "step": 700
255
+ },
256
+ {
257
+ "epoch": 0.014,
258
+ "eval_loss": 2.795973539352417,
259
+ "eval_runtime": 42.2781,
260
+ "eval_samples_per_second": 2.46,
261
+ "eval_steps_per_second": 1.23,
262
+ "step": 700
263
+ },
264
+ {
265
+ "epoch": 0.0145,
266
+ "grad_norm": 4.066322921244863,
267
+ "learning_rate": 1.4480000000000002e-06,
268
+ "loss": 2.7892,
269
+ "step": 725
270
+ },
271
+ {
272
+ "epoch": 0.015,
273
+ "grad_norm": 4.790524346969656,
274
+ "learning_rate": 1.498e-06,
275
+ "loss": 2.7776,
276
+ "step": 750
277
+ },
278
+ {
279
+ "epoch": 0.0155,
280
+ "grad_norm": 4.814208015592297,
281
+ "learning_rate": 1.548e-06,
282
+ "loss": 2.7904,
283
+ "step": 775
284
+ },
285
+ {
286
+ "epoch": 0.016,
287
+ "grad_norm": 3.495397019361677,
288
+ "learning_rate": 1.5980000000000002e-06,
289
+ "loss": 2.7771,
290
+ "step": 800
291
+ },
292
+ {
293
+ "epoch": 0.016,
294
+ "eval_loss": 2.783353328704834,
295
+ "eval_runtime": 45.2475,
296
+ "eval_samples_per_second": 2.298,
297
+ "eval_steps_per_second": 1.149,
298
+ "step": 800
299
+ },
300
+ {
301
+ "epoch": 0.0165,
302
+ "grad_norm": 4.509827964168959,
303
+ "learning_rate": 1.6480000000000001e-06,
304
+ "loss": 2.7864,
305
+ "step": 825
306
+ },
307
+ {
308
+ "epoch": 0.017,
309
+ "grad_norm": 3.396755590212729,
310
+ "learning_rate": 1.6980000000000003e-06,
311
+ "loss": 2.7665,
312
+ "step": 850
313
+ },
314
+ {
315
+ "epoch": 0.0175,
316
+ "grad_norm": 3.6908600934389364,
317
+ "learning_rate": 1.7480000000000002e-06,
318
+ "loss": 2.7784,
319
+ "step": 875
320
+ },
321
+ {
322
+ "epoch": 0.018,
323
+ "grad_norm": 4.517092572588064,
324
+ "learning_rate": 1.798e-06,
325
+ "loss": 2.7718,
326
+ "step": 900
327
+ },
328
+ {
329
+ "epoch": 0.018,
330
+ "eval_loss": 2.772385835647583,
331
+ "eval_runtime": 42.1503,
332
+ "eval_samples_per_second": 2.467,
333
+ "eval_steps_per_second": 1.234,
334
+ "step": 900
335
+ },
336
+ {
337
+ "epoch": 0.0185,
338
+ "grad_norm": 4.1527970820269635,
339
+ "learning_rate": 1.8480000000000001e-06,
340
+ "loss": 2.7592,
341
+ "step": 925
342
+ },
343
+ {
344
+ "epoch": 0.019,
345
+ "grad_norm": 4.093946260210414,
346
+ "learning_rate": 1.898e-06,
347
+ "loss": 2.7728,
348
+ "step": 950
349
+ },
350
+ {
351
+ "epoch": 0.0195,
352
+ "grad_norm": 3.794409923219389,
353
+ "learning_rate": 1.9480000000000002e-06,
354
+ "loss": 2.7757,
355
+ "step": 975
356
+ },
357
+ {
358
+ "epoch": 0.02,
359
+ "grad_norm": 3.128018180220031,
360
+ "learning_rate": 1.998e-06,
361
+ "loss": 2.7614,
362
+ "step": 1000
363
+ },
364
+ {
365
+ "epoch": 0.02,
366
+ "eval_loss": 2.764573335647583,
367
+ "eval_runtime": 42.2226,
368
+ "eval_samples_per_second": 2.463,
369
+ "eval_steps_per_second": 1.232,
370
+ "step": 1000
371
+ },
372
+ {
373
+ "epoch": 0.0205,
374
+ "grad_norm": 3.8078874128993667,
375
+ "learning_rate": 2.048e-06,
376
+ "loss": 2.7629,
377
+ "step": 1025
378
+ },
379
+ {
380
+ "epoch": 0.021,
381
+ "grad_norm": 3.50724949935112,
382
+ "learning_rate": 2.098e-06,
383
+ "loss": 2.776,
384
+ "step": 1050
385
+ },
386
+ {
387
+ "epoch": 0.0215,
388
+ "grad_norm": 3.600343997799952,
389
+ "learning_rate": 2.148e-06,
390
+ "loss": 2.7503,
391
+ "step": 1075
392
+ },
393
+ {
394
+ "epoch": 0.022,
395
+ "grad_norm": 3.4227590286591667,
396
+ "learning_rate": 2.198e-06,
397
+ "loss": 2.7522,
398
+ "step": 1100
399
+ },
400
+ {
401
+ "epoch": 0.022,
402
+ "eval_loss": 2.754957914352417,
403
+ "eval_runtime": 42.1456,
404
+ "eval_samples_per_second": 2.468,
405
+ "eval_steps_per_second": 1.234,
406
+ "step": 1100
407
+ },
408
+ {
409
+ "epoch": 0.0225,
410
+ "grad_norm": 3.6214573340756178,
411
+ "learning_rate": 2.2480000000000003e-06,
412
+ "loss": 2.7423,
413
+ "step": 1125
414
+ },
415
+ {
416
+ "epoch": 0.023,
417
+ "grad_norm": 4.963456774283441,
418
+ "learning_rate": 2.2980000000000003e-06,
419
+ "loss": 2.7473,
420
+ "step": 1150
421
+ },
422
+ {
423
+ "epoch": 0.0235,
424
+ "grad_norm": 4.417511515875024,
425
+ "learning_rate": 2.3480000000000002e-06,
426
+ "loss": 2.7458,
427
+ "step": 1175
428
+ },
429
+ {
430
+ "epoch": 0.024,
431
+ "grad_norm": 3.4640266757488054,
432
+ "learning_rate": 2.398e-06,
433
+ "loss": 2.755,
434
+ "step": 1200
435
+ },
436
+ {
437
+ "epoch": 0.024,
438
+ "eval_loss": 2.744741678237915,
439
+ "eval_runtime": 42.2958,
440
+ "eval_samples_per_second": 2.459,
441
+ "eval_steps_per_second": 1.229,
442
+ "step": 1200
443
+ },
444
+ {
445
+ "epoch": 0.0245,
446
+ "grad_norm": 3.8906187945336637,
447
+ "learning_rate": 2.448e-06,
448
+ "loss": 2.7413,
449
+ "step": 1225
450
+ },
451
+ {
452
+ "epoch": 0.025,
453
+ "grad_norm": 4.103531427287993,
454
+ "learning_rate": 2.498e-06,
455
+ "loss": 2.7464,
456
+ "step": 1250
457
+ },
458
+ {
459
+ "epoch": 0.0255,
460
+ "grad_norm": 3.7381187683762565,
461
+ "learning_rate": 2.5480000000000004e-06,
462
+ "loss": 2.7383,
463
+ "step": 1275
464
+ },
465
+ {
466
+ "epoch": 0.026,
467
+ "grad_norm": 4.019695597142381,
468
+ "learning_rate": 2.598e-06,
469
+ "loss": 2.7286,
470
+ "step": 1300
471
+ },
472
+ {
473
+ "epoch": 0.026,
474
+ "eval_loss": 2.735727071762085,
475
+ "eval_runtime": 42.1778,
476
+ "eval_samples_per_second": 2.466,
477
+ "eval_steps_per_second": 1.233,
478
+ "step": 1300
479
+ },
480
+ {
481
+ "epoch": 0.0265,
482
+ "grad_norm": 3.761754015207239,
483
+ "learning_rate": 2.648e-06,
484
+ "loss": 2.7508,
485
+ "step": 1325
486
+ },
487
+ {
488
+ "epoch": 0.027,
489
+ "grad_norm": 3.5172792845513023,
490
+ "learning_rate": 2.6980000000000003e-06,
491
+ "loss": 2.7396,
492
+ "step": 1350
493
+ },
494
+ {
495
+ "epoch": 0.0275,
496
+ "grad_norm": 3.6926838130981556,
497
+ "learning_rate": 2.748e-06,
498
+ "loss": 2.7286,
499
+ "step": 1375
500
+ },
501
+ {
502
+ "epoch": 0.028,
503
+ "grad_norm": 3.5018547073145,
504
+ "learning_rate": 2.798e-06,
505
+ "loss": 2.7247,
506
+ "step": 1400
507
+ },
508
+ {
509
+ "epoch": 0.028,
510
+ "eval_loss": 2.728515625,
511
+ "eval_runtime": 42.129,
512
+ "eval_samples_per_second": 2.469,
513
+ "eval_steps_per_second": 1.234,
514
+ "step": 1400
515
+ },
516
+ {
517
+ "epoch": 0.0285,
518
+ "grad_norm": 3.575054037567428,
519
+ "learning_rate": 2.848e-06,
520
+ "loss": 2.7229,
521
+ "step": 1425
522
+ },
523
+ {
524
+ "epoch": 0.029,
525
+ "grad_norm": 4.062924067051664,
526
+ "learning_rate": 2.8980000000000005e-06,
527
+ "loss": 2.7208,
528
+ "step": 1450
529
+ },
530
+ {
531
+ "epoch": 0.0295,
532
+ "grad_norm": 3.5741121733868573,
533
+ "learning_rate": 2.9480000000000004e-06,
534
+ "loss": 2.7071,
535
+ "step": 1475
536
+ },
537
+ {
538
+ "epoch": 0.03,
539
+ "grad_norm": 3.9813713940318864,
540
+ "learning_rate": 2.9980000000000003e-06,
541
+ "loss": 2.729,
542
+ "step": 1500
543
+ },
544
+ {
545
+ "epoch": 0.03,
546
+ "eval_loss": 2.721153736114502,
547
+ "eval_runtime": 42.058,
548
+ "eval_samples_per_second": 2.473,
549
+ "eval_steps_per_second": 1.236,
550
+ "step": 1500
551
+ },
552
+ {
553
+ "epoch": 0.0305,
554
+ "grad_norm": 4.465898046671721,
555
+ "learning_rate": 3.0480000000000003e-06,
556
+ "loss": 2.7239,
557
+ "step": 1525
558
+ },
559
+ {
560
+ "epoch": 0.031,
561
+ "grad_norm": 4.083780430751083,
562
+ "learning_rate": 3.0980000000000007e-06,
563
+ "loss": 2.7177,
564
+ "step": 1550
565
+ },
566
+ {
567
+ "epoch": 0.0315,
568
+ "grad_norm": 3.259296223054617,
569
+ "learning_rate": 3.1480000000000006e-06,
570
+ "loss": 2.7149,
571
+ "step": 1575
572
+ },
573
+ {
574
+ "epoch": 0.032,
575
+ "grad_norm": 4.118900376683919,
576
+ "learning_rate": 3.198e-06,
577
+ "loss": 2.7157,
578
+ "step": 1600
579
+ },
580
+ {
581
+ "epoch": 0.032,
582
+ "eval_loss": 2.714693546295166,
583
+ "eval_runtime": 42.155,
584
+ "eval_samples_per_second": 2.467,
585
+ "eval_steps_per_second": 1.234,
586
+ "step": 1600
587
+ },
588
+ {
589
+ "epoch": 0.0325,
590
+ "grad_norm": 3.7685203077928335,
591
+ "learning_rate": 3.248e-06,
592
+ "loss": 2.7185,
593
+ "step": 1625
594
+ },
595
+ {
596
+ "epoch": 0.033,
597
+ "grad_norm": 3.786239665874637,
598
+ "learning_rate": 3.298e-06,
599
+ "loss": 2.694,
600
+ "step": 1650
601
+ },
602
+ {
603
+ "epoch": 0.0335,
604
+ "grad_norm": 4.0202339796786095,
605
+ "learning_rate": 3.348e-06,
606
+ "loss": 2.7076,
607
+ "step": 1675
608
+ },
609
+ {
610
+ "epoch": 0.034,
611
+ "grad_norm": 3.220912468646897,
612
+ "learning_rate": 3.3980000000000003e-06,
613
+ "loss": 2.7086,
614
+ "step": 1700
615
+ },
616
+ {
617
+ "epoch": 0.034,
618
+ "eval_loss": 2.708683967590332,
619
+ "eval_runtime": 42.1812,
620
+ "eval_samples_per_second": 2.466,
621
+ "eval_steps_per_second": 1.233,
622
+ "step": 1700
623
+ },
624
+ {
625
+ "epoch": 0.0345,
626
+ "grad_norm": 3.4236457763643964,
627
+ "learning_rate": 3.4480000000000003e-06,
628
+ "loss": 2.7107,
629
+ "step": 1725
630
+ },
631
+ {
632
+ "epoch": 0.035,
633
+ "grad_norm": 3.428424878937346,
634
+ "learning_rate": 3.4980000000000002e-06,
635
+ "loss": 2.7033,
636
+ "step": 1750
637
+ },
638
+ {
639
+ "epoch": 0.0355,
640
+ "grad_norm": 3.7064590041354597,
641
+ "learning_rate": 3.548e-06,
642
+ "loss": 2.7135,
643
+ "step": 1775
644
+ },
645
+ {
646
+ "epoch": 0.036,
647
+ "grad_norm": 2.6935868617559127,
648
+ "learning_rate": 3.5980000000000005e-06,
649
+ "loss": 2.6977,
650
+ "step": 1800
651
+ },
652
+ {
653
+ "epoch": 0.036,
654
+ "eval_loss": 2.702373743057251,
655
+ "eval_runtime": 42.099,
656
+ "eval_samples_per_second": 2.47,
657
+ "eval_steps_per_second": 1.235,
658
+ "step": 1800
659
+ },
660
+ {
661
+ "epoch": 0.0365,
662
+ "grad_norm": 3.1724624305272577,
663
+ "learning_rate": 3.6480000000000005e-06,
664
+ "loss": 2.6941,
665
+ "step": 1825
666
+ },
667
+ {
668
+ "epoch": 0.037,
669
+ "grad_norm": 3.3947291376692967,
670
+ "learning_rate": 3.6980000000000004e-06,
671
+ "loss": 2.705,
672
+ "step": 1850
673
+ },
674
+ {
675
+ "epoch": 0.0375,
676
+ "grad_norm": 3.2739522130247454,
677
+ "learning_rate": 3.7480000000000004e-06,
678
+ "loss": 2.6971,
679
+ "step": 1875
680
+ },
681
+ {
682
+ "epoch": 0.038,
683
+ "grad_norm": 2.886346941239111,
684
+ "learning_rate": 3.7980000000000007e-06,
685
+ "loss": 2.6878,
686
+ "step": 1900
687
+ },
688
+ {
689
+ "epoch": 0.038,
690
+ "eval_loss": 2.698768138885498,
691
+ "eval_runtime": 42.2524,
692
+ "eval_samples_per_second": 2.461,
693
+ "eval_steps_per_second": 1.231,
694
+ "step": 1900
695
+ },
696
+ {
697
+ "epoch": 0.0385,
698
+ "grad_norm": 2.961130539695273,
699
+ "learning_rate": 3.848e-06,
700
+ "loss": 2.6936,
701
+ "step": 1925
702
+ },
703
+ {
704
+ "epoch": 0.039,
705
+ "grad_norm": 3.2300245788196884,
706
+ "learning_rate": 3.898e-06,
707
+ "loss": 2.6989,
708
+ "step": 1950
709
+ },
710
+ {
711
+ "epoch": 0.0395,
712
+ "grad_norm": 3.2952386418656823,
713
+ "learning_rate": 3.948e-06,
714
+ "loss": 2.6937,
715
+ "step": 1975
716
+ },
717
+ {
718
+ "epoch": 0.04,
719
+ "grad_norm": 2.556435159379079,
720
+ "learning_rate": 3.9980000000000005e-06,
721
+ "loss": 2.6991,
722
+ "step": 2000
723
+ },
724
+ {
725
+ "epoch": 0.04,
726
+ "eval_loss": 2.693058967590332,
727
+ "eval_runtime": 42.2004,
728
+ "eval_samples_per_second": 2.464,
729
+ "eval_steps_per_second": 1.232,
730
+ "step": 2000
731
+ },
732
+ {
733
+ "epoch": 0.0405,
734
+ "grad_norm": 2.975198340671437,
735
+ "learning_rate": 4.048e-06,
736
+ "loss": 2.6896,
737
+ "step": 2025
738
+ },
739
+ {
740
+ "epoch": 0.041,
741
+ "grad_norm": 2.366572300776235,
742
+ "learning_rate": 4.098e-06,
743
+ "loss": 2.6903,
744
+ "step": 2050
745
+ },
746
+ {
747
+ "epoch": 0.0415,
748
+ "grad_norm": 2.650575110326075,
749
+ "learning_rate": 4.148000000000001e-06,
750
+ "loss": 2.6974,
751
+ "step": 2075
752
+ },
753
+ {
754
+ "epoch": 0.042,
755
+ "grad_norm": 2.844363978567716,
756
+ "learning_rate": 4.198e-06,
757
+ "loss": 2.6833,
758
+ "step": 2100
759
+ },
760
+ {
761
+ "epoch": 0.042,
762
+ "eval_loss": 2.687650203704834,
763
+ "eval_runtime": 42.1236,
764
+ "eval_samples_per_second": 2.469,
765
+ "eval_steps_per_second": 1.234,
766
+ "step": 2100
767
+ },
768
+ {
769
+ "epoch": 0.0425,
770
+ "grad_norm": 2.5043519810203425,
771
+ "learning_rate": 4.248000000000001e-06,
772
+ "loss": 2.6848,
773
+ "step": 2125
774
+ },
775
+ {
776
+ "epoch": 0.043,
777
+ "grad_norm": 2.442865859341675,
778
+ "learning_rate": 4.298e-06,
779
+ "loss": 2.6834,
780
+ "step": 2150
781
+ },
782
+ {
783
+ "epoch": 0.0435,
784
+ "grad_norm": 2.396444505850839,
785
+ "learning_rate": 4.3480000000000006e-06,
786
+ "loss": 2.6842,
787
+ "step": 2175
788
+ },
789
+ {
790
+ "epoch": 0.044,
791
+ "grad_norm": 2.467830621762353,
792
+ "learning_rate": 4.398000000000001e-06,
793
+ "loss": 2.6849,
794
+ "step": 2200
795
+ },
796
+ {
797
+ "epoch": 0.044,
798
+ "eval_loss": 2.684495210647583,
799
+ "eval_runtime": 42.337,
800
+ "eval_samples_per_second": 2.456,
801
+ "eval_steps_per_second": 1.228,
802
+ "step": 2200
803
+ },
804
+ {
805
+ "epoch": 0.0445,
806
+ "grad_norm": 2.331183246577976,
807
+ "learning_rate": 4.4480000000000004e-06,
808
+ "loss": 2.6933,
809
+ "step": 2225
810
+ },
811
+ {
812
+ "epoch": 0.045,
813
+ "grad_norm": 2.7108879126095995,
814
+ "learning_rate": 4.498e-06,
815
+ "loss": 2.6756,
816
+ "step": 2250
817
+ },
818
+ {
819
+ "epoch": 0.0455,
820
+ "grad_norm": 2.297487473050839,
821
+ "learning_rate": 4.548e-06,
822
+ "loss": 2.6773,
823
+ "step": 2275
824
+ },
825
+ {
826
+ "epoch": 0.046,
827
+ "grad_norm": 2.260013609826266,
828
+ "learning_rate": 4.598e-06,
829
+ "loss": 2.6869,
830
+ "step": 2300
831
+ },
832
+ {
833
+ "epoch": 0.046,
834
+ "eval_loss": 2.680889368057251,
835
+ "eval_runtime": 42.2308,
836
+ "eval_samples_per_second": 2.463,
837
+ "eval_steps_per_second": 1.231,
838
+ "step": 2300
839
+ },
840
+ {
841
+ "epoch": 0.0465,
842
+ "grad_norm": 2.1362621908829964,
843
+ "learning_rate": 4.648e-06,
844
+ "loss": 2.674,
845
+ "step": 2325
846
+ },
847
+ {
848
+ "epoch": 0.047,
849
+ "grad_norm": 2.530250306266186,
850
+ "learning_rate": 4.698000000000001e-06,
851
+ "loss": 2.6682,
852
+ "step": 2350
853
+ },
854
+ {
855
+ "epoch": 0.0475,
856
+ "grad_norm": 2.284376818082532,
857
+ "learning_rate": 4.748e-06,
858
+ "loss": 2.6741,
859
+ "step": 2375
860
+ },
861
+ {
862
+ "epoch": 0.048,
863
+ "grad_norm": 2.9431781004579403,
864
+ "learning_rate": 4.7980000000000005e-06,
865
+ "loss": 2.6793,
866
+ "step": 2400
867
+ },
868
+ {
869
+ "epoch": 0.048,
870
+ "eval_loss": 2.676382303237915,
871
+ "eval_runtime": 42.1755,
872
+ "eval_samples_per_second": 2.466,
873
+ "eval_steps_per_second": 1.233,
874
+ "step": 2400
875
+ },
876
+ {
877
+ "epoch": 0.0485,
878
+ "grad_norm": 2.2501714313646,
879
+ "learning_rate": 4.848000000000001e-06,
880
+ "loss": 2.6836,
881
+ "step": 2425
882
+ },
883
+ {
884
+ "epoch": 0.049,
885
+ "grad_norm": 2.520507270374293,
886
+ "learning_rate": 4.898e-06,
887
+ "loss": 2.6793,
888
+ "step": 2450
889
+ },
890
+ {
891
+ "epoch": 0.0495,
892
+ "grad_norm": 2.3001609851463156,
893
+ "learning_rate": 4.948000000000001e-06,
894
+ "loss": 2.6825,
895
+ "step": 2475
896
+ },
897
+ {
898
+ "epoch": 0.05,
899
+ "grad_norm": 2.0060268631347973,
900
+ "learning_rate": 4.998e-06,
901
+ "loss": 2.6736,
902
+ "step": 2500
903
+ },
904
+ {
905
+ "epoch": 0.05,
906
+ "eval_loss": 2.671875,
907
+ "eval_runtime": 42.1697,
908
+ "eval_samples_per_second": 2.466,
909
+ "eval_steps_per_second": 1.233,
910
+ "step": 2500
911
+ },
912
+ {
913
+ "epoch": 0.0505,
914
+ "grad_norm": 2.1769919372211564,
915
+ "learning_rate": 5.048000000000001e-06,
916
+ "loss": 2.6741,
917
+ "step": 2525
918
+ },
919
+ {
920
+ "epoch": 0.051,
921
+ "grad_norm": 2.1133782069189366,
922
+ "learning_rate": 5.098000000000001e-06,
923
+ "loss": 2.67,
924
+ "step": 2550
925
+ },
926
+ {
927
+ "epoch": 0.0515,
928
+ "grad_norm": 2.242586565950932,
929
+ "learning_rate": 5.1480000000000005e-06,
930
+ "loss": 2.6835,
931
+ "step": 2575
932
+ },
933
+ {
934
+ "epoch": 0.052,
935
+ "grad_norm": 2.4130154185332615,
936
+ "learning_rate": 5.198000000000001e-06,
937
+ "loss": 2.6752,
938
+ "step": 2600
939
+ },
940
+ {
941
+ "epoch": 0.052,
942
+ "eval_loss": 2.669621467590332,
943
+ "eval_runtime": 42.1123,
944
+ "eval_samples_per_second": 2.47,
945
+ "eval_steps_per_second": 1.235,
946
+ "step": 2600
947
+ },
948
+ {
949
+ "epoch": 0.0525,
950
+ "grad_norm": 2.243339931731786,
951
+ "learning_rate": 5.248000000000001e-06,
952
+ "loss": 2.6631,
953
+ "step": 2625
954
+ },
955
+ {
956
+ "epoch": 0.053,
957
+ "grad_norm": 2.1652170787894964,
958
+ "learning_rate": 5.298000000000001e-06,
959
+ "loss": 2.6653,
960
+ "step": 2650
961
+ },
962
+ {
963
+ "epoch": 0.0535,
964
+ "grad_norm": 2.3514042691010077,
965
+ "learning_rate": 5.348000000000001e-06,
966
+ "loss": 2.6704,
967
+ "step": 2675
968
+ },
969
+ {
970
+ "epoch": 0.054,
971
+ "grad_norm": 2.0555358311645104,
972
+ "learning_rate": 5.398e-06,
973
+ "loss": 2.6744,
974
+ "step": 2700
975
+ },
976
+ {
977
+ "epoch": 0.054,
978
+ "eval_loss": 2.668419361114502,
979
+ "eval_runtime": 42.1636,
980
+ "eval_samples_per_second": 2.467,
981
+ "eval_steps_per_second": 1.233,
982
+ "step": 2700
983
+ },
984
+ {
985
+ "epoch": 0.0545,
986
+ "grad_norm": 2.504233096197935,
987
+ "learning_rate": 5.448e-06,
988
+ "loss": 2.6686,
989
+ "step": 2725
990
+ },
991
+ {
992
+ "epoch": 0.055,
993
+ "grad_norm": 2.1966446495255014,
994
+ "learning_rate": 5.498e-06,
995
+ "loss": 2.6575,
996
+ "step": 2750
997
+ },
998
+ {
999
+ "epoch": 0.0555,
1000
+ "grad_norm": 3.4129666421130738,
1001
+ "learning_rate": 5.548e-06,
1002
+ "loss": 2.6624,
1003
+ "step": 2775
1004
+ },
1005
+ {
1006
+ "epoch": 0.056,
1007
+ "grad_norm": 2.5402178685422028,
1008
+ "learning_rate": 5.5980000000000004e-06,
1009
+ "loss": 2.6615,
1010
+ "step": 2800
1011
+ },
1012
+ {
1013
+ "epoch": 0.056,
1014
+ "eval_loss": 2.666015625,
1015
+ "eval_runtime": 42.1094,
1016
+ "eval_samples_per_second": 2.47,
1017
+ "eval_steps_per_second": 1.235,
1018
+ "step": 2800
1019
+ },
1020
+ {
1021
+ "epoch": 0.0565,
1022
+ "grad_norm": 2.5169534616209215,
1023
+ "learning_rate": 5.648e-06,
1024
+ "loss": 2.6745,
1025
+ "step": 2825
1026
+ },
1027
+ {
1028
+ "epoch": 0.057,
1029
+ "grad_norm": 2.4269096679582347,
1030
+ "learning_rate": 5.698e-06,
1031
+ "loss": 2.658,
1032
+ "step": 2850
1033
+ },
1034
+ {
1035
+ "epoch": 0.0575,
1036
+ "grad_norm": 2.2819396814928763,
1037
+ "learning_rate": 5.748e-06,
1038
+ "loss": 2.6694,
1039
+ "step": 2875
1040
+ },
1041
+ {
1042
+ "epoch": 0.058,
1043
+ "grad_norm": 3.0448163445232512,
1044
+ "learning_rate": 5.798e-06,
1045
+ "loss": 2.6587,
1046
+ "step": 2900
1047
+ },
1048
+ {
1049
+ "epoch": 0.058,
1050
+ "eval_loss": 2.662710428237915,
1051
+ "eval_runtime": 42.173,
1052
+ "eval_samples_per_second": 2.466,
1053
+ "eval_steps_per_second": 1.233,
1054
+ "step": 2900
1055
+ },
1056
+ {
1057
+ "epoch": 0.0585,
1058
+ "grad_norm": 3.2390472506289343,
1059
+ "learning_rate": 5.848000000000001e-06,
1060
+ "loss": 2.661,
1061
+ "step": 2925
1062
+ },
1063
+ {
1064
+ "epoch": 0.059,
1065
+ "grad_norm": 2.5836929915418194,
1066
+ "learning_rate": 5.898e-06,
1067
+ "loss": 2.6514,
1068
+ "step": 2950
1069
+ },
1070
+ {
1071
+ "epoch": 0.0595,
1072
+ "grad_norm": 2.5766876152500227,
1073
+ "learning_rate": 5.9480000000000005e-06,
1074
+ "loss": 2.6673,
1075
+ "step": 2975
1076
+ },
1077
+ {
1078
+ "epoch": 0.06,
1079
+ "grad_norm": 2.507842811667469,
1080
+ "learning_rate": 5.998000000000001e-06,
1081
+ "loss": 2.6658,
1082
+ "step": 3000
1083
+ },
1084
+ {
1085
+ "epoch": 0.06,
1086
+ "eval_loss": 2.659705638885498,
1087
+ "eval_runtime": 42.0906,
1088
+ "eval_samples_per_second": 2.471,
1089
+ "eval_steps_per_second": 1.235,
1090
+ "step": 3000
1091
+ },
1092
+ {
1093
+ "epoch": 0.0605,
1094
+ "grad_norm": 2.291724100817165,
1095
+ "learning_rate": 6.048e-06,
1096
+ "loss": 2.6588,
1097
+ "step": 3025
1098
+ },
1099
+ {
1100
+ "epoch": 0.061,
1101
+ "grad_norm": 2.356775687250912,
1102
+ "learning_rate": 6.098000000000001e-06,
1103
+ "loss": 2.6519,
1104
+ "step": 3050
1105
+ },
1106
+ {
1107
+ "epoch": 0.0615,
1108
+ "grad_norm": 3.6009374683805553,
1109
+ "learning_rate": 6.148e-06,
1110
+ "loss": 2.6581,
1111
+ "step": 3075
1112
+ },
1113
+ {
1114
+ "epoch": 0.062,
1115
+ "grad_norm": 3.2760170273305724,
1116
+ "learning_rate": 6.198000000000001e-06,
1117
+ "loss": 2.6588,
1118
+ "step": 3100
1119
+ },
1120
+ {
1121
+ "epoch": 0.062,
1122
+ "eval_loss": 2.656700611114502,
1123
+ "eval_runtime": 42.0325,
1124
+ "eval_samples_per_second": 2.474,
1125
+ "eval_steps_per_second": 1.237,
1126
+ "step": 3100
1127
+ },
1128
+ {
1129
+ "epoch": 0.0625,
1130
+ "grad_norm": 2.5849236998041825,
1131
+ "learning_rate": 6.248000000000001e-06,
1132
+ "loss": 2.6548,
1133
+ "step": 3125
1134
+ },
1135
+ {
1136
+ "epoch": 0.063,
1137
+ "grad_norm": 2.3095505880624474,
1138
+ "learning_rate": 6.2980000000000005e-06,
1139
+ "loss": 2.6511,
1140
+ "step": 3150
1141
+ },
1142
+ {
1143
+ "epoch": 0.0635,
1144
+ "grad_norm": 2.5258255422234996,
1145
+ "learning_rate": 6.348000000000001e-06,
1146
+ "loss": 2.6589,
1147
+ "step": 3175
1148
+ },
1149
+ {
1150
+ "epoch": 0.064,
1151
+ "grad_norm": 2.3520030773681335,
1152
+ "learning_rate": 6.398000000000001e-06,
1153
+ "loss": 2.6462,
1154
+ "step": 3200
1155
+ },
1156
+ {
1157
+ "epoch": 0.064,
1158
+ "eval_loss": 2.652644157409668,
1159
+ "eval_runtime": 42.2271,
1160
+ "eval_samples_per_second": 2.463,
1161
+ "eval_steps_per_second": 1.231,
1162
+ "step": 3200
1163
+ },
1164
+ {
1165
+ "epoch": 0.0645,
1166
+ "grad_norm": 2.457532178302885,
1167
+ "learning_rate": 6.448000000000001e-06,
1168
+ "loss": 2.6495,
1169
+ "step": 3225
1170
+ },
1171
+ {
1172
+ "epoch": 0.065,
1173
+ "grad_norm": 2.3328730844475833,
1174
+ "learning_rate": 6.498000000000001e-06,
1175
+ "loss": 2.6384,
1176
+ "step": 3250
1177
+ },
1178
+ {
1179
+ "epoch": 0.0655,
1180
+ "grad_norm": 2.382459769400574,
1181
+ "learning_rate": 6.548000000000001e-06,
1182
+ "loss": 2.652,
1183
+ "step": 3275
1184
+ },
1185
+ {
1186
+ "epoch": 0.066,
1187
+ "grad_norm": 2.4287460984943707,
1188
+ "learning_rate": 6.598000000000001e-06,
1189
+ "loss": 2.655,
1190
+ "step": 3300
1191
+ },
1192
+ {
1193
+ "epoch": 0.066,
1194
+ "eval_loss": 2.650841236114502,
1195
+ "eval_runtime": 42.1822,
1196
+ "eval_samples_per_second": 2.465,
1197
+ "eval_steps_per_second": 1.233,
1198
+ "step": 3300
1199
+ },
1200
+ {
1201
+ "epoch": 0.0665,
1202
+ "grad_norm": 3.0374923212376963,
1203
+ "learning_rate": 6.648e-06,
1204
+ "loss": 2.6623,
1205
+ "step": 3325
1206
+ },
1207
+ {
1208
+ "epoch": 0.067,
1209
+ "grad_norm": 2.3072135476674127,
1210
+ "learning_rate": 6.698e-06,
1211
+ "loss": 2.6484,
1212
+ "step": 3350
1213
+ },
1214
+ {
1215
+ "epoch": 0.0675,
1216
+ "grad_norm": 2.3676328206176778,
1217
+ "learning_rate": 6.7480000000000004e-06,
1218
+ "loss": 2.6569,
1219
+ "step": 3375
1220
+ },
1221
+ {
1222
+ "epoch": 0.068,
1223
+ "grad_norm": 2.313390296186245,
1224
+ "learning_rate": 6.798e-06,
1225
+ "loss": 2.6393,
1226
+ "step": 3400
1227
+ },
1228
+ {
1229
+ "epoch": 0.068,
1230
+ "eval_loss": 2.648888111114502,
1231
+ "eval_runtime": 44.6877,
1232
+ "eval_samples_per_second": 2.327,
1233
+ "eval_steps_per_second": 1.164,
1234
+ "step": 3400
1235
+ },
1236
+ {
1237
+ "epoch": 0.0685,
1238
+ "grad_norm": 2.9181668179248033,
1239
+ "learning_rate": 6.848e-06,
1240
+ "loss": 2.6521,
1241
+ "step": 3425
1242
+ },
1243
+ {
1244
+ "epoch": 0.069,
1245
+ "grad_norm": 2.1972242976901457,
1246
+ "learning_rate": 6.898e-06,
1247
+ "loss": 2.6605,
1248
+ "step": 3450
1249
+ },
1250
+ {
1251
+ "epoch": 0.0695,
1252
+ "grad_norm": 2.514104559780915,
1253
+ "learning_rate": 6.948e-06,
1254
+ "loss": 2.6444,
1255
+ "step": 3475
1256
+ },
1257
+ {
1258
+ "epoch": 0.07,
1259
+ "grad_norm": 2.463879404265904,
1260
+ "learning_rate": 6.998000000000001e-06,
1261
+ "loss": 2.6586,
1262
+ "step": 3500
1263
+ },
1264
+ {
1265
+ "epoch": 0.07,
1266
+ "eval_loss": 2.644831657409668,
1267
+ "eval_runtime": 45.1164,
1268
+ "eval_samples_per_second": 2.305,
1269
+ "eval_steps_per_second": 1.153,
1270
+ "step": 3500
1271
  }
1272
  ],
1273
  "logging_steps": 25,
 
1287
  "attributes": {}
1288
  }
1289
  },
1290
+ "total_flos": 7.854767599744188e+18,
1291
  "train_batch_size": 1,
1292
  "trial_name": null,
1293
  "trial_params": null