Azrail commited on
Commit
3e40867
·
verified ·
1 Parent(s): 6f9d760

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f297721cb03be26e6092587489bee99025182be771eea6e05f77edb6bd34f03
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09e9299c4a411196fea3d5279894585e8d1a7575c08eb1779c5008bb7e4a49b7
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6081c2cece2bea94b0bdedc301e6d7a3c34eef0aaba0899c0043359233ba4468
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c444730bd04d34d146261d2e799975f1275e5903aace9e152e7e5c01154c912
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b49390de6348da607ac8b8e7ddf69e26d2c764b165fb5c92780c29f0de564e6c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3c7e8305d45d254f0365c29c304654706064d85b369eee2a35f47f258c35c43
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78e83e408c816bc1f16e9a76bc9d7e37229ddececd6ea07286eb16c742dca118
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e7450dae308a1f566442c67e6e8e15b97c271edd460f95249b85ad7cccbd395
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.02196604324290188,
6
  "eval_steps": 500,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -186,11 +186,189 @@
186
  "eval_steps_per_second": 18.865,
187
  "num_input_tokens_seen": 1048576000,
188
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  }
190
  ],
191
  "logging_steps": 50,
192
  "max_steps": 200000,
193
- "num_input_tokens_seen": 1048576000,
194
  "num_train_epochs": 5,
195
  "save_steps": 1000,
196
  "stateful_callbacks": {
@@ -205,7 +383,7 @@
205
  "attributes": {}
206
  }
207
  },
208
- "total_flos": 5.97171715964928e+17,
209
  "train_batch_size": 64,
210
  "trial_name": null,
211
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.04393208648580376,
6
  "eval_steps": 500,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
186
  "eval_steps_per_second": 18.865,
187
  "num_input_tokens_seen": 1048576000,
188
  "step": 1000
189
+ },
190
+ {
191
+ "epoch": 0.02306434540504697,
192
+ "grad_norm": 0.6067565083503723,
193
+ "learning_rate": 0.00021,
194
+ "loss": 4.5355,
195
+ "num_input_tokens_seen": 1101004800,
196
+ "step": 1050
197
+ },
198
+ {
199
+ "epoch": 0.024162647567192067,
200
+ "grad_norm": 0.6668316721916199,
201
+ "learning_rate": 0.00022,
202
+ "loss": 4.4383,
203
+ "num_input_tokens_seen": 1153433600,
204
+ "step": 1100
205
+ },
206
+ {
207
+ "epoch": 0.02526094972933716,
208
+ "grad_norm": 0.3714616000652313,
209
+ "learning_rate": 0.00023,
210
+ "loss": 4.3538,
211
+ "num_input_tokens_seen": 1205862400,
212
+ "step": 1150
213
+ },
214
+ {
215
+ "epoch": 0.026359251891482256,
216
+ "grad_norm": 0.439012348651886,
217
+ "learning_rate": 0.00024,
218
+ "loss": 4.2848,
219
+ "num_input_tokens_seen": 1258291200,
220
+ "step": 1200
221
+ },
222
+ {
223
+ "epoch": 0.027457554053627348,
224
+ "grad_norm": 0.5026286840438843,
225
+ "learning_rate": 0.00025,
226
+ "loss": 4.2181,
227
+ "num_input_tokens_seen": 1310720000,
228
+ "step": 1250
229
+ },
230
+ {
231
+ "epoch": 0.028555856215772443,
232
+ "grad_norm": 0.4865541160106659,
233
+ "learning_rate": 0.00026000000000000003,
234
+ "loss": 4.1495,
235
+ "num_input_tokens_seen": 1363148800,
236
+ "step": 1300
237
+ },
238
+ {
239
+ "epoch": 0.029654158377917537,
240
+ "grad_norm": 0.5259677767753601,
241
+ "learning_rate": 0.00027,
242
+ "loss": 4.0873,
243
+ "num_input_tokens_seen": 1415577600,
244
+ "step": 1350
245
+ },
246
+ {
247
+ "epoch": 0.030752460540062632,
248
+ "grad_norm": 0.4151704013347626,
249
+ "learning_rate": 0.00028000000000000003,
250
+ "loss": 4.0369,
251
+ "num_input_tokens_seen": 1468006400,
252
+ "step": 1400
253
+ },
254
+ {
255
+ "epoch": 0.03185076270220773,
256
+ "grad_norm": 0.5806245803833008,
257
+ "learning_rate": 0.00029,
258
+ "loss": 3.9881,
259
+ "num_input_tokens_seen": 1520435200,
260
+ "step": 1450
261
+ },
262
+ {
263
+ "epoch": 0.03294906486435282,
264
+ "grad_norm": 0.46140730381011963,
265
+ "learning_rate": 0.0003,
266
+ "loss": 3.9311,
267
+ "num_input_tokens_seen": 1572864000,
268
+ "step": 1500
269
+ },
270
+ {
271
+ "epoch": 0.03294906486435282,
272
+ "eval_loss": 3.8112432956695557,
273
+ "eval_runtime": 65.8947,
274
+ "eval_samples_per_second": 75.879,
275
+ "eval_steps_per_second": 18.97,
276
+ "num_input_tokens_seen": 1572864000,
277
+ "step": 1500
278
+ },
279
+ {
280
+ "epoch": 0.03404736702649791,
281
+ "grad_norm": 0.4219188392162323,
282
+ "learning_rate": 0.00031,
283
+ "loss": 3.8972,
284
+ "num_input_tokens_seen": 1625292800,
285
+ "step": 1550
286
+ },
287
+ {
288
+ "epoch": 0.03514566918864301,
289
+ "grad_norm": 0.3506027162075043,
290
+ "learning_rate": 0.00032,
291
+ "loss": 3.8596,
292
+ "num_input_tokens_seen": 1677721600,
293
+ "step": 1600
294
+ },
295
+ {
296
+ "epoch": 0.0362439713507881,
297
+ "grad_norm": 0.5210819840431213,
298
+ "learning_rate": 0.00033,
299
+ "loss": 3.8182,
300
+ "num_input_tokens_seen": 1730150400,
301
+ "step": 1650
302
+ },
303
+ {
304
+ "epoch": 0.03734227351293319,
305
+ "grad_norm": 0.5830159783363342,
306
+ "learning_rate": 0.00034,
307
+ "loss": 3.7766,
308
+ "num_input_tokens_seen": 1782579200,
309
+ "step": 1700
310
+ },
311
+ {
312
+ "epoch": 0.03844057567507829,
313
+ "grad_norm": 0.4602348804473877,
314
+ "learning_rate": 0.00035,
315
+ "loss": 3.7362,
316
+ "num_input_tokens_seen": 1835008000,
317
+ "step": 1750
318
+ },
319
+ {
320
+ "epoch": 0.03953887783722338,
321
+ "grad_norm": 0.40075036883354187,
322
+ "learning_rate": 0.00035999999999999997,
323
+ "loss": 3.7136,
324
+ "num_input_tokens_seen": 1887436800,
325
+ "step": 1800
326
+ },
327
+ {
328
+ "epoch": 0.04063717999936848,
329
+ "grad_norm": 0.3893415629863739,
330
+ "learning_rate": 0.00037,
331
+ "loss": 3.6809,
332
+ "num_input_tokens_seen": 1939865600,
333
+ "step": 1850
334
+ },
335
+ {
336
+ "epoch": 0.04173548216151357,
337
+ "grad_norm": 0.2921469211578369,
338
+ "learning_rate": 0.00038,
339
+ "loss": 3.6565,
340
+ "num_input_tokens_seen": 1992294400,
341
+ "step": 1900
342
+ },
343
+ {
344
+ "epoch": 0.04283378432365866,
345
+ "grad_norm": 0.49007460474967957,
346
+ "learning_rate": 0.00039000000000000005,
347
+ "loss": 3.6215,
348
+ "num_input_tokens_seen": 2044723200,
349
+ "step": 1950
350
+ },
351
+ {
352
+ "epoch": 0.04393208648580376,
353
+ "grad_norm": 0.2980474531650543,
354
+ "learning_rate": 0.0004,
355
+ "loss": 3.591,
356
+ "num_input_tokens_seen": 2097152000,
357
+ "step": 2000
358
+ },
359
+ {
360
+ "epoch": 0.04393208648580376,
361
+ "eval_loss": 3.4769670963287354,
362
+ "eval_runtime": 62.8853,
363
+ "eval_samples_per_second": 79.51,
364
+ "eval_steps_per_second": 19.877,
365
+ "num_input_tokens_seen": 2097152000,
366
+ "step": 2000
367
  }
368
  ],
369
  "logging_steps": 50,
370
  "max_steps": 200000,
371
+ "num_input_tokens_seen": 2097152000,
372
  "num_train_epochs": 5,
373
  "save_steps": 1000,
374
  "stateful_callbacks": {
 
383
  "attributes": {}
384
  }
385
  },
386
+ "total_flos": 1.194343431929856e+18,
387
  "train_batch_size": 64,
388
  "trial_name": null,
389
  "trial_params": null