LLJYY committed on
Commit
e9eb123
·
verified ·
1 Parent(s): b3bd4c6

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b3c0e1cfdfa2dffb8d4b7855449842127299042621fb9cb40bf905c5361df76
3
  size 174663600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd2ce83fb862336b4729018afc8e291aa4c2d38ce3b3b5625756b6a68e191913
3
  size 174663600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3e0d22c235547201bf15c44946ffcac705d25f40204672dfcd89d6375fe45e0
3
  size 177908997
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c4a1c0542d95372b9b98cd04e9e19b1d3278913800f038cdb84306140c9e0f5
3
  size 177908997
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03d5ae5345e6508f9972971a93fe179fa2979377ba4d65fe51dc8554de4896a7
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b03751160f25dd1f7c08604bdbd7711f070d950dfb96d9acede0b0ccf333222
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fad64f681b15ebe2c8020dfe218c4c9cf164f2a3132754563c82640e0ca97129
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9803335b183336349e91b866e4b2332f37cecb5e5bd9cf6a14b120c0067b5d71
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2362111727884729,
6
  "eval_steps": 500,
7
- "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -208,6 +208,206 @@
208
  "mean_token_accuracy": 0.8954069356620312,
209
  "num_tokens": 8271560.0,
210
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  }
212
  ],
213
  "logging_steps": 25,
@@ -227,7 +427,7 @@
227
  "attributes": {}
228
  }
229
  },
230
- "total_flos": 5.283874067165597e+17,
231
  "train_batch_size": 2,
232
  "trial_name": null,
233
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4724223455769458,
6
  "eval_steps": 500,
7
+ "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
208
  "mean_token_accuracy": 0.8954069356620312,
209
  "num_tokens": 8271560.0,
210
  "step": 500
211
+ },
212
+ {
213
+ "entropy": 0.3849054877832532,
214
+ "epoch": 0.24802173142789655,
215
+ "grad_norm": 0.298828125,
216
+ "learning_rate": 8.238993710691824e-05,
217
+ "loss": 0.3863,
218
+ "mean_token_accuracy": 0.8975082874298096,
219
+ "num_tokens": 8681455.0,
220
+ "step": 525
221
+ },
222
+ {
223
+ "entropy": 0.3748340607620776,
224
+ "epoch": 0.25983229006732017,
225
+ "grad_norm": 0.26171875,
226
+ "learning_rate": 8.632075471698113e-05,
227
+ "loss": 0.3758,
228
+ "mean_token_accuracy": 0.9003522478044033,
229
+ "num_tokens": 9087910.0,
230
+ "step": 550
231
+ },
232
+ {
233
+ "entropy": 0.37765146313235165,
234
+ "epoch": 0.2716428487067438,
235
+ "grad_norm": 0.2890625,
236
+ "learning_rate": 9.025157232704403e-05,
237
+ "loss": 0.3797,
238
+ "mean_token_accuracy": 0.8989867885410786,
239
+ "num_tokens": 9515141.0,
240
+ "step": 575
241
+ },
242
+ {
243
+ "entropy": 0.3586317488178611,
244
+ "epoch": 0.2834534073461675,
245
+ "grad_norm": 0.310546875,
246
+ "learning_rate": 9.418238993710692e-05,
247
+ "loss": 0.3581,
248
+ "mean_token_accuracy": 0.9045962546765804,
249
+ "num_tokens": 9930880.0,
250
+ "step": 600
251
+ },
252
+ {
253
+ "entropy": 0.352232004404068,
254
+ "epoch": 0.2952639659855911,
255
+ "grad_norm": 0.283203125,
256
+ "learning_rate": 9.811320754716981e-05,
257
+ "loss": 0.3548,
258
+ "mean_token_accuracy": 0.9052738857269287,
259
+ "num_tokens": 10342008.0,
260
+ "step": 625
261
+ },
262
+ {
263
+ "entropy": 0.3356729177199304,
264
+ "epoch": 0.3070745246250148,
265
+ "grad_norm": 0.345703125,
266
+ "learning_rate": 9.99987232903501e-05,
267
+ "loss": 0.3366,
268
+ "mean_token_accuracy": 0.9096107052266598,
269
+ "num_tokens": 10752059.0,
270
+ "step": 650
271
+ },
272
+ {
273
+ "entropy": 0.325265455506742,
274
+ "epoch": 0.31888508326443843,
275
+ "grad_norm": 0.3359375,
276
+ "learning_rate": 9.998909165950179e-05,
277
+ "loss": 0.3265,
278
+ "mean_token_accuracy": 0.9125442025065422,
279
+ "num_tokens": 11171710.0,
280
+ "step": 675
281
+ },
282
+ {
283
+ "entropy": 0.3302194595709443,
284
+ "epoch": 0.330695641903862,
285
+ "grad_norm": 0.275390625,
286
+ "learning_rate": 9.997001907852635e-05,
287
+ "loss": 0.3284,
288
+ "mean_token_accuracy": 0.9123465406894684,
289
+ "num_tokens": 11591328.0,
290
+ "step": 700
291
+ },
292
+ {
293
+ "entropy": 0.3197361998446286,
294
+ "epoch": 0.3425062005432857,
295
+ "grad_norm": 0.333984375,
296
+ "learning_rate": 9.994150914947533e-05,
297
+ "loss": 0.3189,
298
+ "mean_token_accuracy": 0.9138493274152278,
299
+ "num_tokens": 12007445.0,
300
+ "step": 725
301
+ },
302
+ {
303
+ "entropy": 0.30230416195467114,
304
+ "epoch": 0.35431675918270933,
305
+ "grad_norm": 0.29296875,
306
+ "learning_rate": 9.990356725673984e-05,
307
+ "loss": 0.3,
308
+ "mean_token_accuracy": 0.9195481817424297,
309
+ "num_tokens": 12410123.0,
310
+ "step": 750
311
+ },
312
+ {
313
+ "entropy": 0.28528838012367486,
314
+ "epoch": 0.366127317822133,
315
+ "grad_norm": 0.37109375,
316
+ "learning_rate": 9.985620056603348e-05,
317
+ "loss": 0.2833,
318
+ "mean_token_accuracy": 0.9226957756280899,
319
+ "num_tokens": 12817735.0,
320
+ "step": 775
321
+ },
322
+ {
323
+ "entropy": 0.2931415150873363,
324
+ "epoch": 0.37793787646155663,
325
+ "grad_norm": 0.26171875,
326
+ "learning_rate": 9.979941802303922e-05,
327
+ "loss": 0.2925,
328
+ "mean_token_accuracy": 0.9212754264473915,
329
+ "num_tokens": 13239923.0,
330
+ "step": 800
331
+ },
332
+ {
333
+ "entropy": 0.28012413138523695,
334
+ "epoch": 0.3897484351009803,
335
+ "grad_norm": 0.322265625,
336
+ "learning_rate": 9.973323035171982e-05,
337
+ "loss": 0.2757,
338
+ "mean_token_accuracy": 0.9254035331308842,
339
+ "num_tokens": 13656664.0,
340
+ "step": 825
341
+ },
342
+ {
343
+ "entropy": 0.2714027400501072,
344
+ "epoch": 0.40155899374040394,
345
+ "grad_norm": 0.2451171875,
346
+ "learning_rate": 9.965765005229248e-05,
347
+ "loss": 0.2694,
348
+ "mean_token_accuracy": 0.9276439958810806,
349
+ "num_tokens": 14067791.0,
350
+ "step": 850
351
+ },
352
+ {
353
+ "entropy": 0.2834288664162159,
354
+ "epoch": 0.4133695523798276,
355
+ "grad_norm": 0.333984375,
356
+ "learning_rate": 9.957269139886808e-05,
357
+ "loss": 0.2804,
358
+ "mean_token_accuracy": 0.9238070417940617,
359
+ "num_tokens": 14467768.0,
360
+ "step": 875
361
+ },
362
+ {
363
+ "entropy": 0.2719649145565927,
364
+ "epoch": 0.4251801110192512,
365
+ "grad_norm": 0.267578125,
366
+ "learning_rate": 9.947837043675537e-05,
367
+ "loss": 0.2668,
368
+ "mean_token_accuracy": 0.9286511231958866,
369
+ "num_tokens": 14885985.0,
370
+ "step": 900
371
+ },
372
+ {
373
+ "entropy": 0.2586036479100585,
374
+ "epoch": 0.43699066965867484,
375
+ "grad_norm": 0.353515625,
376
+ "learning_rate": 9.937470497943064e-05,
377
+ "loss": 0.255,
378
+ "mean_token_accuracy": 0.931592576354742,
379
+ "num_tokens": 15291855.0,
380
+ "step": 925
381
+ },
382
+ {
383
+ "entropy": 0.2572586870100349,
384
+ "epoch": 0.4488012282980985,
385
+ "grad_norm": 0.302734375,
386
+ "learning_rate": 9.926171460517348e-05,
387
+ "loss": 0.2541,
388
+ "mean_token_accuracy": 0.9317594549059868,
389
+ "num_tokens": 15697297.0,
390
+ "step": 950
391
+ },
392
+ {
393
+ "entropy": 0.2428761958517134,
394
+ "epoch": 0.46061178693752214,
395
+ "grad_norm": 0.287109375,
396
+ "learning_rate": 9.913942065336921e-05,
397
+ "loss": 0.2381,
398
+ "mean_token_accuracy": 0.9357545764744282,
399
+ "num_tokens": 16112052.0,
400
+ "step": 975
401
+ },
402
+ {
403
+ "entropy": 0.2368166291434318,
404
+ "epoch": 0.4724223455769458,
405
+ "grad_norm": 0.296875,
406
+ "learning_rate": 9.90078462204787e-05,
407
+ "loss": 0.2316,
408
+ "mean_token_accuracy": 0.9370592629909515,
409
+ "num_tokens": 16532116.0,
410
+ "step": 1000
411
  }
412
  ],
413
  "logging_steps": 25,
 
427
  "attributes": {}
428
  }
429
  },
430
+ "total_flos": 1.0567860712982623e+18,
431
  "train_batch_size": 2,
432
  "trial_name": null,
433
  "trial_params": null