Azrail committed on
Commit
295ed29
·
verified ·
1 Parent(s): 1241868

Training in progress, step 65000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4464ecdd36ba9fcbb768fb530bac7125d2ebc3403bceccb2b1857ab10495094
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ad9801ec7b3ea03c8febaf16be0cca903ae6c5e7ba16db1d0ab836be5805c8b
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca3203148d8c928e5e1184f1534f1177f73f6ec7ebdc7acfe3b62c2af0779f9d
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcaa21f2d1112b5786bb6cb8a7af07df0a486ccdc4e343d067ea09aba3ebc0cf
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f03ef68c121377c551657263f23acf972b60bf546b00ad9803912e5c78e5ecd
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5abe0ab18889dbab668e6d9fae1d62109a3226e616d0e681a91c9a668ea4330
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a5e9561ab6074dc857170aae9d2b27d70afb0686bb61ba701f52af71ad4d4a9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b166fab474c8d8470da4ff5d475f9ae65d65d8dd07f0e702e6e8c799bab73616
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.30528161036049467,
6
  "eval_steps": 500,
7
- "global_step": 64000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11400,11 +11400,189 @@
11400
  "eval_steps_per_second": 23.473,
11401
  "num_input_tokens_seen": 16777211456,
11402
  "step": 64000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11403
  }
11404
  ],
11405
  "logging_steps": 50,
11406
  "max_steps": 70000,
11407
- "num_input_tokens_seen": 16777211456,
11408
  "num_train_epochs": 1,
11409
  "save_steps": 1000,
11410
  "stateful_callbacks": {
@@ -11419,7 +11597,7 @@
11419
  "attributes": {}
11420
  }
11421
  },
11422
- "total_flos": 4.4880678100638106e+18,
11423
  "train_batch_size": 64,
11424
  "trial_name": null,
11425
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.31005163552237736,
6
  "eval_steps": 500,
7
+ "global_step": 65000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11400
  "eval_steps_per_second": 23.473,
11401
  "num_input_tokens_seen": 16777211456,
11402
  "step": 64000
11403
+ },
11404
+ {
11405
+ "epoch": 0.3055201116185888,
11406
+ "grad_norm": 0.1703004688024521,
11407
+ "learning_rate": 0.00038327731807204744,
11408
+ "loss": 2.5506,
11409
+ "num_input_tokens_seen": 16790318656,
11410
+ "step": 64050
11411
+ },
11412
+ {
11413
+ "epoch": 0.30575861287668293,
11414
+ "grad_norm": 0.19769616425037384,
11415
+ "learning_rate": 0.00037782979693105293,
11416
+ "loss": 2.542,
11417
+ "num_input_tokens_seen": 16803425856,
11418
+ "step": 64100
11419
+ },
11420
+ {
11421
+ "epoch": 0.30599711413477704,
11422
+ "grad_norm": 0.20674961805343628,
11423
+ "learning_rate": 0.00037239765536817873,
11424
+ "loss": 2.539,
11425
+ "num_input_tokens_seen": 16816533056,
11426
+ "step": 64150
11427
+ },
11428
+ {
11429
+ "epoch": 0.3062356153928712,
11430
+ "grad_norm": 0.19121839106082916,
11431
+ "learning_rate": 0.0003669815772166625,
11432
+ "loss": 2.5573,
11433
+ "num_input_tokens_seen": 16829640256,
11434
+ "step": 64200
11435
+ },
11436
+ {
11437
+ "epoch": 0.30647411665096536,
11438
+ "grad_norm": 0.1734025925397873,
11439
+ "learning_rate": 0.00036158224428757535,
11440
+ "loss": 2.5416,
11441
+ "num_input_tokens_seen": 16842747456,
11442
+ "step": 64250
11443
+ },
11444
+ {
11445
+ "epoch": 0.30671261790905946,
11446
+ "grad_norm": 0.1857634037733078,
11447
+ "learning_rate": 0.0003562003362839914,
11448
+ "loss": 2.5652,
11449
+ "num_input_tokens_seen": 16855854656,
11450
+ "step": 64300
11451
+ },
11452
+ {
11453
+ "epoch": 0.3069511191671536,
11454
+ "grad_norm": 0.17733143270015717,
11455
+ "learning_rate": 0.000350836530715422,
11456
+ "loss": 2.5299,
11457
+ "num_input_tokens_seen": 16868961856,
11458
+ "step": 64350
11459
+ },
11460
+ {
11461
+ "epoch": 0.3071896204252477,
11462
+ "grad_norm": 0.18323005735874176,
11463
+ "learning_rate": 0.00034549150281252633,
11464
+ "loss": 2.5691,
11465
+ "num_input_tokens_seen": 16882069056,
11466
+ "step": 64400
11467
+ },
11468
+ {
11469
+ "epoch": 0.3074281216833419,
11470
+ "grad_norm": 0.18570365011692047,
11471
+ "learning_rate": 0.00034016592544210936,
11472
+ "loss": 2.5436,
11473
+ "num_input_tokens_seen": 16895176256,
11474
+ "step": 64450
11475
+ },
11476
+ {
11477
+ "epoch": 0.30766662294143604,
11478
+ "grad_norm": 0.18571798503398895,
11479
+ "learning_rate": 0.00033486046902241664,
11480
+ "loss": 2.5382,
11481
+ "num_input_tokens_seen": 16908283456,
11482
+ "step": 64500
11483
+ },
11484
+ {
11485
+ "epoch": 0.30766662294143604,
11486
+ "eval_loss": 2.4323015213012695,
11487
+ "eval_runtime": 53.7237,
11488
+ "eval_samples_per_second": 93.069,
11489
+ "eval_steps_per_second": 23.267,
11490
+ "num_input_tokens_seen": 16908283456,
11491
+ "step": 64500
11492
+ },
11493
+ {
11494
+ "epoch": 0.30790512419953014,
11495
+ "grad_norm": 0.1829528957605362,
11496
+ "learning_rate": 0.0003295758014387375,
11497
+ "loss": 2.5453,
11498
+ "num_input_tokens_seen": 16921390656,
11499
+ "step": 64550
11500
+ },
11501
+ {
11502
+ "epoch": 0.3081436254576243,
11503
+ "grad_norm": 0.1703086644411087,
11504
+ "learning_rate": 0.0003243125879593286,
11505
+ "loss": 2.5441,
11506
+ "num_input_tokens_seen": 16934497856,
11507
+ "step": 64600
11508
+ },
11509
+ {
11510
+ "epoch": 0.3083821267157184,
11511
+ "grad_norm": 0.17826180160045624,
11512
+ "learning_rate": 0.000319071491151664,
11513
+ "loss": 2.545,
11514
+ "num_input_tokens_seen": 16947605056,
11515
+ "step": 64650
11516
+ },
11517
+ {
11518
+ "epoch": 0.30862062797381257,
11519
+ "grad_norm": 0.17889030277729034,
11520
+ "learning_rate": 0.00031385317079902743,
11521
+ "loss": 2.5405,
11522
+ "num_input_tokens_seen": 16960712256,
11523
+ "step": 64700
11524
+ },
11525
+ {
11526
+ "epoch": 0.30885912923190667,
11527
+ "grad_norm": 0.1711336225271225,
11528
+ "learning_rate": 0.0003086582838174551,
11529
+ "loss": 2.5222,
11530
+ "num_input_tokens_seen": 16973819456,
11531
+ "step": 64750
11532
+ },
11533
+ {
11534
+ "epoch": 0.30909763049000083,
11535
+ "grad_norm": 0.17962214350700378,
11536
+ "learning_rate": 0.0003034874841730382,
11537
+ "loss": 2.5376,
11538
+ "num_input_tokens_seen": 16986926656,
11539
+ "step": 64800
11540
+ },
11541
+ {
11542
+ "epoch": 0.309336131748095,
11543
+ "grad_norm": 0.1699627935886383,
11544
+ "learning_rate": 0.0002983414227995975,
11545
+ "loss": 2.5616,
11546
+ "num_input_tokens_seen": 17000033856,
11547
+ "step": 64850
11548
+ },
11549
+ {
11550
+ "epoch": 0.3095746330061891,
11551
+ "grad_norm": 0.18442535400390625,
11552
+ "learning_rate": 0.00029322074751673977,
11553
+ "loss": 2.5377,
11554
+ "num_input_tokens_seen": 17013141056,
11555
+ "step": 64900
11556
+ },
11557
+ {
11558
+ "epoch": 0.30981313426428325,
11559
+ "grad_norm": 0.17972196638584137,
11560
+ "learning_rate": 0.0002881261029483057,
11561
+ "loss": 2.5474,
11562
+ "num_input_tokens_seen": 17026248256,
11563
+ "step": 64950
11564
+ },
11565
+ {
11566
+ "epoch": 0.31005163552237736,
11567
+ "grad_norm": 0.1810217946767807,
11568
+ "learning_rate": 0.00028305813044122096,
11569
+ "loss": 2.5286,
11570
+ "num_input_tokens_seen": 17039355456,
11571
+ "step": 65000
11572
+ },
11573
+ {
11574
+ "epoch": 0.31005163552237736,
11575
+ "eval_loss": 2.4292306900024414,
11576
+ "eval_runtime": 53.3956,
11577
+ "eval_samples_per_second": 93.641,
11578
+ "eval_steps_per_second": 23.41,
11579
+ "num_input_tokens_seen": 17039355456,
11580
+ "step": 65000
11581
  }
11582
  ],
11583
  "logging_steps": 50,
11584
  "max_steps": 70000,
11585
+ "num_input_tokens_seen": 17039355456,
11586
  "num_train_epochs": 1,
11587
  "save_steps": 1000,
11588
  "stateful_callbacks": {
 
11597
  "attributes": {}
11598
  }
11599
  },
11600
+ "total_flos": 4.5581938885892506e+18,
11601
  "train_batch_size": 64,
11602
  "trial_name": null,
11603
  "trial_params": null