{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.8737651998737894,
  "eval_steps": 500,
  "global_step": 9000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 9.708502220819883e-05,
      "grad_norm": 96.5,
      "learning_rate": 1.9379844961240311e-07,
      "loss": 4.2495036125183105,
      "memory(GiB)": 112.92,
      "step": 1,
      "token_acc": 0.40126382306477093,
      "train_speed(iter/s)": 0.130363
    },
    {
      "epoch": 0.0248537656852989,
      "grad_norm": 6.84375,
      "learning_rate": 4.96124031007752e-05,
      "loss": 2.0199908088235294,
      "memory(GiB)": 138.16,
      "step": 256,
      "token_acc": 0.5958056756256362,
      "train_speed(iter/s)": 0.135217
    },
    {
      "epoch": 0.0497075313705978,
      "grad_norm": 2.6875,
      "learning_rate": 9.92248062015504e-05,
      "loss": 1.4577598571777344,
      "memory(GiB)": 138.17,
      "step": 512,
      "token_acc": 0.6785023086521644,
      "train_speed(iter/s)": 0.135385
    },
    {
      "epoch": 0.0745612970558967,
      "grad_norm": 2.296875,
      "learning_rate": 9.983643805989245e-05,
      "loss": 1.4217802286148071,
      "memory(GiB)": 138.17,
      "step": 768,
      "token_acc": 0.6849496734466487,
      "train_speed(iter/s)": 0.134581
    },
    {
      "epoch": 0.0994150627411956,
      "grad_norm": 1.8984375,
      "learning_rate": 9.933643638247476e-05,
      "loss": 1.3426355123519897,
      "memory(GiB)": 138.17,
      "step": 1024,
      "token_acc": 0.699673728686982,
      "train_speed(iter/s)": 0.132515
    },
    {
      "epoch": 0.1242688284264945,
      "grad_norm": 1.84375,
      "learning_rate": 9.850332959964666e-05,
      "loss": 1.2444982528686523,
      "memory(GiB)": 138.17,
      "step": 1280,
      "token_acc": 0.7187271993751019,
      "train_speed(iter/s)": 0.133056
    },
    {
      "epoch": 0.1491225941117934,
      "grad_norm": 1.859375,
      "learning_rate": 9.73427426033351e-05,
      "loss": 1.1624003648757935,
      "memory(GiB)": 138.17,
      "step": 1536,
      "token_acc": 0.7339836063834865,
      "train_speed(iter/s)": 0.133627
    },
    {
      "epoch": 0.1739763597970923,
      "grad_norm": 1.7421875,
      "learning_rate": 9.58625113355353e-05,
      "loss": 1.095345377922058,
      "memory(GiB)": 138.17,
      "step": 1792,
      "token_acc": 0.7477444378213578,
      "train_speed(iter/s)": 0.133066
    },
    {
      "epoch": 0.1988301254823912,
      "grad_norm": 1.796875,
      "learning_rate": 9.407262988233549e-05,
      "loss": 1.0396682024002075,
      "memory(GiB)": 138.17,
      "step": 2048,
      "token_acc": 0.7577293820771361,
      "train_speed(iter/s)": 0.132805
    },
    {
      "epoch": 0.2236838911676901,
      "grad_norm": 2.078125,
      "learning_rate": 9.19851829967875e-05,
      "loss": 0.9765125513076782,
      "memory(GiB)": 138.17,
      "step": 2304,
      "token_acc": 0.7712227904219364,
      "train_speed(iter/s)": 0.133242
    },
    {
      "epoch": 0.248537656852989,
      "grad_norm": 1.5703125,
      "learning_rate": 8.961426450620912e-05,
      "loss": 0.920336127281189,
      "memory(GiB)": 138.17,
      "step": 2560,
      "token_acc": 0.7830755957422817,
      "train_speed(iter/s)": 0.133383
    },
    {
      "epoch": 0.2733914225382879,
      "grad_norm": 1.59375,
      "learning_rate": 8.69758821548079e-05,
      "loss": 0.8801365494728088,
      "memory(GiB)": 138.17,
      "step": 2816,
      "token_acc": 0.7913349866408025,
      "train_speed(iter/s)": 0.133433
    },
    {
      "epoch": 0.2982451882235868,
      "grad_norm": 1.1640625,
      "learning_rate": 8.408784952410122e-05,
      "loss": 0.8334779739379883,
      "memory(GiB)": 138.17,
      "step": 3072,
      "token_acc": 0.8006175937055493,
      "train_speed(iter/s)": 0.132889
    },
    {
      "epoch": 0.3230989539088857,
      "grad_norm": 1.6875,
      "learning_rate": 8.096966576085406e-05,
      "loss": 0.7884229421615601,
      "memory(GiB)": 138.17,
      "step": 3328,
      "token_acc": 0.8102322071595001,
      "train_speed(iter/s)": 0.133073
    },
    {
      "epoch": 0.3479527195941846,
      "grad_norm": 1.4921875,
      "learning_rate": 7.764238392457582e-05,
      "loss": 0.7397578954696655,
      "memory(GiB)": 138.17,
      "step": 3584,
      "token_acc": 0.8212528591555482,
      "train_speed(iter/s)": 0.133335
    },
    {
      "epoch": 0.3728064852794835,
      "grad_norm": 2.1875,
      "learning_rate": 7.412846884345582e-05,
      "loss": 0.7087571024894714,
      "memory(GiB)": 138.17,
      "step": 3840,
      "token_acc": 0.8286589691203703,
      "train_speed(iter/s)": 0.133468
    },
    {
      "epoch": 0.3976602509647824,
      "grad_norm": 1.1953125,
      "learning_rate": 7.045164543845158e-05,
      "loss": 0.6600534319877625,
      "memory(GiB)": 138.17,
      "step": 4096,
      "token_acc": 0.8389953998490116,
      "train_speed(iter/s)": 0.133269
    },
    {
      "epoch": 0.4225140166500813,
      "grad_norm": 1.984375,
      "learning_rate": 6.663673853960154e-05,
      "loss": 0.6196721196174622,
      "memory(GiB)": 138.17,
      "step": 4352,
      "token_acc": 0.8484769522886115,
      "train_speed(iter/s)": 0.133418
    },
    {
      "epoch": 0.4473677823353802,
      "grad_norm": 1.203125,
      "learning_rate": 6.270950527607537e-05,
      "loss": 0.5864973068237305,
      "memory(GiB)": 138.17,
      "step": 4608,
      "token_acc": 0.8560292743162837,
      "train_speed(iter/s)": 0.133475
    },
    {
      "epoch": 0.4722215480206791,
      "grad_norm": 1.203125,
      "learning_rate": 5.86964611716145e-05,
      "loss": 0.5385364294052124,
      "memory(GiB)": 138.17,
      "step": 4864,
      "token_acc": 0.86720534525908,
      "train_speed(iter/s)": 0.133422
    },
    {
      "epoch": 0.497075313705978,
      "grad_norm": 1.359375,
      "learning_rate": 5.4624701119515856e-05,
      "loss": 0.49772173166275024,
      "memory(GiB)": 138.17,
      "step": 5120,
      "token_acc": 0.8767477774531491,
      "train_speed(iter/s)": 0.133344
    },
    {
      "epoch": 0.5219290793912769,
      "grad_norm": 1.40625,
      "learning_rate": 5.0521716445882614e-05,
      "loss": 0.46582430601119995,
      "memory(GiB)": 138.17,
      "step": 5376,
      "token_acc": 0.8850724068459814,
      "train_speed(iter/s)": 0.13342
    },
    {
      "epoch": 0.5467828450765758,
      "grad_norm": 1.4140625,
      "learning_rate": 4.64152092962774e-05,
      "loss": 0.4441249668598175,
      "memory(GiB)": 138.17,
      "step": 5632,
      "token_acc": 0.8896486479315997,
      "train_speed(iter/s)": 0.133319
    },
    {
      "epoch": 0.5716366107618747,
      "grad_norm": 1.8203125,
      "learning_rate": 4.2332905598984413e-05,
      "loss": 0.40981537103652954,
      "memory(GiB)": 138.17,
      "step": 5888,
      "token_acc": 0.8990366693094052,
      "train_speed(iter/s)": 0.133434
    },
    {
      "epoch": 0.5964903764471736,
      "grad_norm": 1.5234375,
      "learning_rate": 3.830236786769761e-05,
      "loss": 0.3865773379802704,
      "memory(GiB)": 138.17,
      "step": 6144,
      "token_acc": 0.9034815882027802,
      "train_speed(iter/s)": 0.133189
    },
    {
      "epoch": 0.6213441421324725,
      "grad_norm": 1.03125,
      "learning_rate": 3.4350809107536214e-05,
      "loss": 0.36623093485832214,
      "memory(GiB)": 138.17,
      "step": 6400,
      "token_acc": 0.9082446782242596,
      "train_speed(iter/s)": 0.133233
    },
    {
      "epoch": 0.6461979078177714,
      "grad_norm": 2.203125,
      "learning_rate": 3.0504909080839294e-05,
      "loss": 0.34115397930145264,
      "memory(GiB)": 138.17,
      "step": 6656,
      "token_acc": 0.914435132291292,
      "train_speed(iter/s)": 0.133334
    },
    {
      "epoch": 0.6710516735030703,
      "grad_norm": 1.6015625,
      "learning_rate": 2.6790634173258577e-05,
      "loss": 0.3342404067516327,
      "memory(GiB)": 138.17,
      "step": 6912,
      "token_acc": 0.916860246202295,
      "train_speed(iter/s)": 0.133453
    },
    {
      "epoch": 0.6959054391883692,
      "grad_norm": 1.3515625,
      "learning_rate": 2.323306207636102e-05,
      "loss": 0.3142353296279907,
      "memory(GiB)": 138.17,
      "step": 7168,
      "token_acc": 0.9221213834353058,
      "train_speed(iter/s)": 0.133271
    },
    {
      "epoch": 0.7207592048736681,
      "grad_norm": 2.03125,
      "learning_rate": 1.9856212470432345e-05,
      "loss": 0.30621492862701416,
      "memory(GiB)": 138.17,
      "step": 7424,
      "token_acc": 0.9249788937888913,
      "train_speed(iter/s)": 0.133231
    },
    {
      "epoch": 0.745612970558967,
      "grad_norm": 1.5078125,
      "learning_rate": 1.6682884850661395e-05,
      "loss": 0.2921682596206665,
      "memory(GiB)": 138.17,
      "step": 7680,
      "token_acc": 0.9275989615640866,
      "train_speed(iter/s)": 0.133307
    },
    {
      "epoch": 0.7704667362442659,
      "grad_norm": 1.3671875,
      "learning_rate": 1.3734504591655495e-05,
      "loss": 0.2854159474372864,
      "memory(GiB)": 138.17,
      "step": 7936,
      "token_acc": 0.9285902741314146,
      "train_speed(iter/s)": 0.133282
    },
    {
      "epoch": 0.7953205019295648,
      "grad_norm": 2.0625,
      "learning_rate": 1.1030978289613726e-05,
      "loss": 0.28136610984802246,
      "memory(GiB)": 138.17,
      "step": 8192,
      "token_acc": 0.9303321847535716,
      "train_speed(iter/s)": 0.132989
    },
    {
      "epoch": 0.8201742676148637,
      "grad_norm": 1.28125,
      "learning_rate": 8.590559358845118e-06,
      "loss": 0.2735920548439026,
      "memory(GiB)": 138.17,
      "step": 8448,
      "token_acc": 0.931173780136838,
      "train_speed(iter/s)": 0.13298
    },
    {
      "epoch": 0.8450280333001626,
      "grad_norm": 1.3671875,
      "learning_rate": 6.4297247900848125e-06,
      "loss": 0.2691897451877594,
      "memory(GiB)": 138.17,
      "step": 8704,
      "token_acc": 0.9327428202220522,
      "train_speed(iter/s)": 0.132967
    },
    {
      "epoch": 0.8698817989854615,
      "grad_norm": 1.5234375,
      "learning_rate": 4.563063902699582e-06,
      "loss": 0.26750853657722473,
      "memory(GiB)": 138.17,
      "step": 8960,
      "token_acc": 0.9330804530345964,
      "train_speed(iter/s)": 0.132959
    }
  ],
  "logging_steps": 256,
  "max_steps": 10301,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1037325459482546e+19,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}