Azrail committed on
Commit
47f1efe
·
verified ·
1 Parent(s): fc23b04

Training in progress, step 59000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1fbbcf8e4efabf5866400ce20d5f64dfe9bcdba3c76105321e75b94424bbdf9a
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59a706f60964ffe8cd2b221f9a7465c0f56181a98072bee3057047cce8e408cf
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee28446b68e061d51e2acb6d49ad965661e91bf2d3291a5dc5003af4c9992cc6
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ef2119eabf69c54d09db0a76c3313d847c900937c3e2edb463f3eba3b1000af
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec1bfb0db1c21e8b4cd52af95928aa8366b624cdfe8a7ae4baa053e84325dfb8
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfe4fcebd5141fdf7604535ed8dc60cda464d7e4d084d78ec5c9b7105325f9b5
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e04abc75ac3354daa3070b9f9eb5e8a95eba4855d092af143aa714bd01a0140a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e5b084cf754d7494e17fb8efe3747874197d5052ad1bcb013283a3027835137
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.39013693470083055,
6
  "eval_steps": 500,
7
- "global_step": 58000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -10332,11 +10332,189 @@
10332
  "eval_steps_per_second": 23.346,
10333
  "num_input_tokens_seen": 15204352000,
10334
  "step": 58000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10335
  }
10336
  ],
10337
  "logging_steps": 50,
10338
  "max_steps": 60000,
10339
- "num_input_tokens_seen": 15204352000,
10340
  "num_train_epochs": 1,
10341
  "save_steps": 1000,
10342
  "stateful_callbacks": {
@@ -10351,7 +10529,7 @@
10351
  "attributes": {}
10352
  }
10353
  },
10354
- "total_flos": 4.06731255447552e+18,
10355
  "train_batch_size": 64,
10356
  "trial_name": null,
10357
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3968634335749828,
6
  "eval_steps": 500,
7
+ "global_step": 59000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
10332
  "eval_steps_per_second": 23.346,
10333
  "num_input_tokens_seen": 15204352000,
10334
  "step": 58000
10335
+ },
10336
+ {
10337
+ "epoch": 0.39047325964453816,
10338
+ "grad_norm": 0.1577194780111313,
10339
+ "learning_rate": 0.00023875071764202561,
10340
+ "loss": 2.9866,
10341
+ "num_input_tokens_seen": 15217459200,
10342
+ "step": 58050
10343
+ },
10344
+ {
10345
+ "epoch": 0.3908095845882458,
10346
+ "grad_norm": 0.1869671791791916,
10347
+ "learning_rate": 0.00022768048249248646,
10348
+ "loss": 2.9973,
10349
+ "num_input_tokens_seen": 15230566400,
10350
+ "step": 58100
10351
+ },
10352
+ {
10353
+ "epoch": 0.3911459095319534,
10354
+ "grad_norm": 0.1568073183298111,
10355
+ "learning_rate": 0.0002167968815375837,
10356
+ "loss": 3.0012,
10357
+ "num_input_tokens_seen": 15243673600,
10358
+ "step": 58150
10359
+ },
10360
+ {
10361
+ "epoch": 0.391482234475661,
10362
+ "grad_norm": 0.15343065559864044,
10363
+ "learning_rate": 0.00020610737385376348,
10364
+ "loss": 2.988,
10365
+ "num_input_tokens_seen": 15256780800,
10366
+ "step": 58200
10367
+ },
10368
+ {
10369
+ "epoch": 0.3918185594193686,
10370
+ "grad_norm": 0.22413235902786255,
10371
+ "learning_rate": 0.00019561928549563967,
10372
+ "loss": 2.993,
10373
+ "num_input_tokens_seen": 15269888000,
10374
+ "step": 58250
10375
+ },
10376
+ {
10377
+ "epoch": 0.3921548843630762,
10378
+ "grad_norm": 0.1807044893503189,
10379
+ "learning_rate": 0.00018533980447508135,
10380
+ "loss": 2.9905,
10381
+ "num_input_tokens_seen": 15282995200,
10382
+ "step": 58300
10383
+ },
10384
+ {
10385
+ "epoch": 0.39249120930678383,
10386
+ "grad_norm": 0.1571112871170044,
10387
+ "learning_rate": 0.00017527597583490823,
10388
+ "loss": 2.9983,
10389
+ "num_input_tokens_seen": 15296102400,
10390
+ "step": 58350
10391
+ },
10392
+ {
10393
+ "epoch": 0.39282753425049144,
10394
+ "grad_norm": 0.16821637749671936,
10395
+ "learning_rate": 0.00016543469682057105,
10396
+ "loss": 2.9966,
10397
+ "num_input_tokens_seen": 15309209600,
10398
+ "step": 58400
10399
+ },
10400
+ {
10401
+ "epoch": 0.39316385919419905,
10402
+ "grad_norm": 0.1497010737657547,
10403
+ "learning_rate": 0.00015582271215312294,
10404
+ "loss": 2.9814,
10405
+ "num_input_tokens_seen": 15322316800,
10406
+ "step": 58450
10407
+ },
10408
+ {
10409
+ "epoch": 0.39350018413790666,
10410
+ "grad_norm": 0.15679225325584412,
10411
+ "learning_rate": 0.00014644660940672628,
10412
+ "loss": 2.9876,
10413
+ "num_input_tokens_seen": 15335424000,
10414
+ "step": 58500
10415
+ },
10416
+ {
10417
+ "epoch": 0.39350018413790666,
10418
+ "eval_loss": 2.8887994289398193,
10419
+ "eval_runtime": 53.8449,
10420
+ "eval_samples_per_second": 92.859,
10421
+ "eval_steps_per_second": 23.215,
10422
+ "num_input_tokens_seen": 15335424000,
10423
+ "step": 58500
10424
+ },
10425
+ {
10426
+ "epoch": 0.39383650908161427,
10427
+ "grad_norm": 0.15169823169708252,
10428
+ "learning_rate": 0.0001373128144938563,
10429
+ "loss": 2.9875,
10430
+ "num_input_tokens_seen": 15348531200,
10431
+ "step": 58550
10432
+ },
10433
+ {
10434
+ "epoch": 0.3941728340253219,
10435
+ "grad_norm": 0.1635347604751587,
10436
+ "learning_rate": 0.00012842758726130281,
10437
+ "loss": 2.9898,
10438
+ "num_input_tokens_seen": 15361638400,
10439
+ "step": 58600
10440
+ },
10441
+ {
10442
+ "epoch": 0.3945091589690295,
10443
+ "grad_norm": 0.15156348049640656,
10444
+ "learning_rate": 0.00011979701719998454,
10445
+ "loss": 2.9977,
10446
+ "num_input_tokens_seen": 15374745600,
10447
+ "step": 58650
10448
+ },
10449
+ {
10450
+ "epoch": 0.3948454839127371,
10451
+ "grad_norm": 0.15710316598415375,
10452
+ "learning_rate": 0.00011142701927151455,
10453
+ "loss": 2.981,
10454
+ "num_input_tokens_seen": 15387852800,
10455
+ "step": 58700
10456
+ },
10457
+ {
10458
+ "epoch": 0.3951818088564447,
10459
+ "grad_norm": 0.2838917374610901,
10460
+ "learning_rate": 0.00010332332985438247,
10461
+ "loss": 2.9909,
10462
+ "num_input_tokens_seen": 15400960000,
10463
+ "step": 58750
10464
+ },
10465
+ {
10466
+ "epoch": 0.3955181338001524,
10467
+ "grad_norm": 0.1509639173746109,
10468
+ "learning_rate": 9.549150281252633e-05,
10469
+ "loss": 2.9851,
10470
+ "num_input_tokens_seen": 15414067200,
10471
+ "step": 58800
10472
+ },
10473
+ {
10474
+ "epoch": 0.39585445874386,
10475
+ "grad_norm": 0.1501421183347702,
10476
+ "learning_rate": 8.793690568899215e-05,
10477
+ "loss": 2.9931,
10478
+ "num_input_tokens_seen": 15427174400,
10479
+ "step": 58850
10480
+ },
10481
+ {
10482
+ "epoch": 0.3961907836875676,
10483
+ "grad_norm": 0.14904147386550903,
10484
+ "learning_rate": 8.066471602728804e-05,
10485
+ "loss": 2.9862,
10486
+ "num_input_tokens_seen": 15440281600,
10487
+ "step": 58900
10488
+ },
10489
+ {
10490
+ "epoch": 0.3965271086312752,
10491
+ "grad_norm": 0.15182824432849884,
10492
+ "learning_rate": 7.367991782295391e-05,
10493
+ "loss": 2.9882,
10494
+ "num_input_tokens_seen": 15453388800,
10495
+ "step": 58950
10496
+ },
10497
+ {
10498
+ "epoch": 0.3968634335749828,
10499
+ "grad_norm": 0.14710576832294464,
10500
+ "learning_rate": 6.698729810778065e-05,
10501
+ "loss": 2.9856,
10502
+ "num_input_tokens_seen": 15466496000,
10503
+ "step": 59000
10504
+ },
10505
+ {
10506
+ "epoch": 0.3968634335749828,
10507
+ "eval_loss": 2.8845956325531006,
10508
+ "eval_runtime": 53.5429,
10509
+ "eval_samples_per_second": 93.383,
10510
+ "eval_steps_per_second": 23.346,
10511
+ "num_input_tokens_seen": 15466496000,
10512
+ "step": 59000
10513
  }
10514
  ],
10515
  "logging_steps": 50,
10516
  "max_steps": 60000,
10517
+ "num_input_tokens_seen": 15466496000,
10518
  "num_train_epochs": 1,
10519
  "save_steps": 1000,
10520
  "stateful_callbacks": {
 
10529
  "attributes": {}
10530
  }
10531
  },
10532
+ "total_flos": 4.13743863300096e+18,
10533
  "train_batch_size": 64,
10534
  "trial_name": null,
10535
  "trial_params": null