{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 4888,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.020458265139116204,
      "grad_norm": 0.1146482527256012,
      "learning_rate": 6.666666666666667e-06,
      "loss": 2.4366,
      "step": 100
    },
    {
      "epoch": 0.04091653027823241,
      "grad_norm": 0.19526588916778564,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 2.4074,
      "step": 200
    },
    {
      "epoch": 0.06137479541734861,
      "grad_norm": 0.2903444170951843,
      "learning_rate": 2e-05,
      "loss": 2.3776,
      "step": 300
    },
    {
      "epoch": 0.08183306055646482,
      "grad_norm": 0.38961780071258545,
      "learning_rate": 1.9976565632583726e-05,
      "loss": 2.3633,
      "step": 400
    },
    {
      "epoch": 0.10229132569558101,
      "grad_norm": 0.39514774084091187,
      "learning_rate": 1.990637236425014e-05,
      "loss": 2.3275,
      "step": 500
    },
    {
      "epoch": 0.12274959083469722,
      "grad_norm": 0.4908258020877838,
      "learning_rate": 1.9789749181967304e-05,
      "loss": 2.2917,
      "step": 600
    },
    {
      "epoch": 0.1432078559738134,
      "grad_norm": 0.45681577920913696,
      "learning_rate": 1.9627242683835782e-05,
      "loss": 2.2691,
      "step": 700
    },
    {
      "epoch": 0.16366612111292964,
      "grad_norm": 0.514772891998291,
      "learning_rate": 1.9419614517252536e-05,
      "loss": 2.2543,
      "step": 800
    },
    {
      "epoch": 0.18412438625204583,
      "grad_norm": 0.5909072160720825,
      "learning_rate": 1.916783780916589e-05,
      "loss": 2.2484,
      "step": 900
    },
    {
      "epoch": 0.20458265139116202,
      "grad_norm": 0.5185887217521667,
      "learning_rate": 1.8873092605152686e-05,
      "loss": 2.2393,
      "step": 1000
    },
    {
      "epoch": 0.22504091653027825,
      "grad_norm": 0.6383090615272522,
      "learning_rate": 1.8536760338693926e-05,
      "loss": 2.2473,
      "step": 1100
    },
    {
      "epoch": 0.24549918166939444,
      "grad_norm": 0.5739400386810303,
      "learning_rate": 1.816041735657083e-05,
      "loss": 2.2004,
      "step": 1200
    },
    {
      "epoch": 0.26595744680851063,
      "grad_norm": 0.6199432611465454,
      "learning_rate": 1.7745827530726937e-05,
      "loss": 2.2167,
      "step": 1300
    },
    {
      "epoch": 0.2864157119476268,
      "grad_norm": 0.6655313372612,
      "learning_rate": 1.7294933991223413e-05,
      "loss": 2.1977,
      "step": 1400
    },
    {
      "epoch": 0.306873977086743,
      "grad_norm": 0.6417890787124634,
      "learning_rate": 1.6809850019034324e-05,
      "loss": 2.1943,
      "step": 1500
    },
    {
      "epoch": 0.32733224222585927,
      "grad_norm": 0.6394040584564209,
      "learning_rate": 1.6292849141366084e-05,
      "loss": 2.2409,
      "step": 1600
    },
    {
      "epoch": 0.34779050736497547,
      "grad_norm": 0.6412447690963745,
      "learning_rate": 1.574635447592305e-05,
      "loss": 2.2241,
      "step": 1700
    },
    {
      "epoch": 0.36824877250409166,
      "grad_norm": 0.7298178672790527,
      "learning_rate": 1.5172927374061427e-05,
      "loss": 2.1782,
      "step": 1800
    },
    {
      "epoch": 0.38870703764320785,
      "grad_norm": 0.7089855074882507,
      "learning_rate": 1.4575255416059513e-05,
      "loss": 2.2225,
      "step": 1900
    },
    {
      "epoch": 0.40916530278232405,
      "grad_norm": 0.6368885636329651,
      "learning_rate": 1.3956139814768949e-05,
      "loss": 2.1694,
      "step": 2000
    },
    {
      "epoch": 0.42962356792144024,
      "grad_norm": 0.6372247338294983,
      "learning_rate": 1.3318482286684498e-05,
      "loss": 2.1647,
      "step": 2100
    },
    {
      "epoch": 0.4500818330605565,
      "grad_norm": 0.8332213759422302,
      "learning_rate": 1.2665271451965933e-05,
      "loss": 2.1871,
      "step": 2200
    },
    {
      "epoch": 0.4705400981996727,
      "grad_norm": 0.6431373357772827,
      "learning_rate": 1.1999568827153472e-05,
      "loss": 2.1635,
      "step": 2300
    },
    {
      "epoch": 0.4909983633387889,
      "grad_norm": 0.7330695986747742,
      "learning_rate": 1.1324494476227082e-05,
      "loss": 2.1713,
      "step": 2400
    },
    {
      "epoch": 0.5114566284779051,
      "grad_norm": 0.8543004393577576,
      "learning_rate": 1.0643212387261345e-05,
      "loss": 2.1591,
      "step": 2500
    },
    {
      "epoch": 0.5319148936170213,
      "grad_norm": 0.819828987121582,
      "learning_rate": 9.958915643213654e-06,
      "loss": 2.1482,
      "step": 2600
    },
    {
      "epoch": 0.5523731587561375,
      "grad_norm": 0.7319233417510986,
      "learning_rate": 9.274811456348358e-06,
      "loss": 2.1939,
      "step": 2700
    },
    {
      "epoch": 0.5728314238952537,
      "grad_norm": 0.7505283355712891,
      "learning_rate": 8.594106136438665e-06,
      "loss": 2.1661,
      "step": 2800
    },
    {
      "epoch": 0.5932896890343698,
      "grad_norm": 0.8119780421257019,
      "learning_rate": 7.919990063198368e-06,
      "loss": 2.1571,
      "step": 2900
    },
    {
      "epoch": 0.613747954173486,
      "grad_norm": 0.8243106603622437,
      "learning_rate": 7.255622733375776e-06,
      "loss": 2.1802,
      "step": 3000
    },
    {
      "epoch": 0.6342062193126022,
      "grad_norm": 0.7845710515975952,
      "learning_rate": 6.604117952592168e-06,
      "loss": 2.1798,
      "step": 3100
    },
    {
      "epoch": 0.6546644844517185,
      "grad_norm": 0.7707119584083557,
      "learning_rate": 5.968529241328822e-06,
      "loss": 2.1381,
      "step": 3200
    },
    {
      "epoch": 0.6751227495908347,
      "grad_norm": 0.7787159085273743,
      "learning_rate": 5.351835523462808e-06,
      "loss": 2.156,
      "step": 3300
    },
    {
      "epoch": 0.6955810147299509,
      "grad_norm": 0.6944179534912109,
      "learning_rate": 4.756927164427685e-06,
      "loss": 2.163,
      "step": 3400
    },
    {
      "epoch": 0.7160392798690671,
      "grad_norm": 0.8797001838684082,
      "learning_rate": 4.18659242443638e-06,
      "loss": 2.1375,
      "step": 3500
    },
    {
      "epoch": 0.7364975450081833,
      "grad_norm": 0.7624711990356445,
      "learning_rate": 3.6435043902583344e-06,
      "loss": 2.1825,
      "step": 3600
    },
    {
      "epoch": 0.7569558101472995,
      "grad_norm": 0.7933531999588013,
      "learning_rate": 3.1302084468000206e-06,
      "loss": 2.1442,
      "step": 3700
    },
    {
      "epoch": 0.7774140752864157,
      "grad_norm": 0.9495463967323303,
      "learning_rate": 2.6491103472078828e-06,
      "loss": 2.1191,
      "step": 3800
    },
    {
      "epoch": 0.7978723404255319,
      "grad_norm": 1.328906774520874,
      "learning_rate": 2.202464937407752e-06,
      "loss": 2.1698,
      "step": 3900
    },
    {
      "epoch": 0.8183306055646481,
      "grad_norm": 0.9682719111442566,
      "learning_rate": 1.7923655879272395e-06,
      "loss": 2.1254,
      "step": 4000
    },
    {
      "epoch": 0.8387888707037643,
      "grad_norm": 0.7559231519699097,
      "learning_rate": 1.4207343825329167e-06,
      "loss": 2.1739,
      "step": 4100
    },
    {
      "epoch": 0.8592471358428805,
      "grad_norm": 0.8757209777832031,
      "learning_rate": 1.089313109666904e-06,
      "loss": 2.1108,
      "step": 4200
    },
    {
      "epoch": 0.8797054009819967,
      "grad_norm": 0.911118745803833,
      "learning_rate": 7.996550989047813e-07,
      "loss": 2.1683,
      "step": 4300
    },
    {
      "epoch": 0.900163666121113,
      "grad_norm": 0.9142606854438782,
      "learning_rate": 5.531179406964016e-07,
      "loss": 2.1318,
      "step": 4400
    },
    {
      "epoch": 0.9206219312602292,
      "grad_norm": 0.9097318649291992,
      "learning_rate": 3.5085712351121016e-07,
      "loss": 2.1436,
      "step": 4500
    },
    {
      "epoch": 0.9410801963993454,
      "grad_norm": 0.7701767086982727,
      "learning_rate": 1.9382061820997112e-07,
      "loss": 2.1591,
      "step": 4600
    },
    {
      "epoch": 0.9615384615384616,
      "grad_norm": 0.7752079367637634,
      "learning_rate": 8.274443502528817e-08,
      "loss": 2.1662,
      "step": 4700
    },
    {
      "epoch": 0.9819967266775778,
      "grad_norm": 0.7332737445831299,
      "learning_rate": 1.814917397474636e-08,
      "loss": 2.1488,
      "step": 4800
    },
    {
      "epoch": 1.0,
      "step": 4888,
      "total_flos": 8.882163499008e+16,
      "train_loss": 2.2017843461856126,
      "train_runtime": 1517.6787,
      "train_samples_per_second": 6.441,
      "train_steps_per_second": 3.221
    }
  ],
  "logging_steps": 100,
  "max_steps": 4888,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.882163499008e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}