Azrail commited on
Commit
730f7ae
·
verified ·
1 Parent(s): 1a7adcd

Training in progress, step 116000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4052b3f6dee6acc6e8461ad996dfa79e27245712edf2d1f3321a44a85660ffc
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88eb3f94bc7241f618e5c9770b54c115b258d914f67d481780ad17863ab32c2e
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0922c14e94c809f8792d25d931657f0739836f8872958cc36e36c78337b7886b
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ead16386e0cfae3ee1c925e0e05a55f093ed2c84207e3beb26950f24f2d0edd3
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db41ee9f728a0f615e34c377aa1f203a61ceeaf873404658f962d92e3c5c6285
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:315a996739a8cfadd830b0d25c5fc7336620692744591af847d9b45121986328
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e98ce821f7f40a728bc6b049ace38a924402d0d066809b7215e9faa83ce3c45c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f9c807b963b46c441b7e935adcacbb554bdd0c85992b7453ee29eed159b81fb
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0971081722456086,
6
  "eval_steps": 500,
7
- "global_step": 115000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -20478,11 +20478,189 @@
20478
  "eval_steps_per_second": 15.153,
20479
  "num_input_tokens_seen": 60283464768,
20480
  "step": 115000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20481
  }
20482
  ],
20483
  "logging_steps": 50,
20484
  "max_steps": 140000,
20485
- "num_input_tokens_seen": 60283464768,
20486
  "num_train_epochs": 2,
20487
  "save_steps": 1000,
20488
  "stateful_callbacks": {
@@ -20497,7 +20675,7 @@
20497
  "attributes": {}
20498
  }
20499
  },
20500
- "total_flos": 1.0669073693538632e+20,
20501
  "train_batch_size": 32,
20502
  "trial_name": null,
20503
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1066482225693741,
6
  "eval_steps": 500,
7
+ "global_step": 116000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
20478
  "eval_steps_per_second": 15.153,
20479
  "num_input_tokens_seen": 60283464768,
20480
  "step": 115000
20481
+ },
20482
+ {
20483
+ "epoch": 1.0975851747617968,
20484
+ "grad_norm": 0.14440514147281647,
20485
+ "learning_rate": 0.0009710078129677895,
20486
+ "loss": 2.0927,
20487
+ "num_input_tokens_seen": 60309676352,
20488
+ "step": 115050
20489
+ },
20490
+ {
20491
+ "epoch": 1.098062177277985,
20492
+ "grad_norm": 0.13419468700885773,
20493
+ "learning_rate": 0.0009700591372846095,
20494
+ "loss": 2.0871,
20495
+ "num_input_tokens_seen": 60335889280,
20496
+ "step": 115100
20497
+ },
20498
+ {
20499
+ "epoch": 1.0985391797941735,
20500
+ "grad_norm": 0.14434845745563507,
20501
+ "learning_rate": 0.0009690956679612422,
20502
+ "loss": 2.0823,
20503
+ "num_input_tokens_seen": 60362096256,
20504
+ "step": 115150
20505
+ },
20506
+ {
20507
+ "epoch": 1.0990161823103617,
20508
+ "grad_norm": 0.14158272743225098,
20509
+ "learning_rate": 0.0009681174353198686,
20510
+ "loss": 2.0932,
20511
+ "num_input_tokens_seen": 60388308192,
20512
+ "step": 115200
20513
+ },
20514
+ {
20515
+ "epoch": 1.09949318482655,
20516
+ "grad_norm": 0.1499590128660202,
20517
+ "learning_rate": 0.0009671244701472999,
20518
+ "loss": 2.0901,
20519
+ "num_input_tokens_seen": 60414516160,
20520
+ "step": 115250
20521
+ },
20522
+ {
20523
+ "epoch": 1.0999701873427383,
20524
+ "grad_norm": 0.13877320289611816,
20525
+ "learning_rate": 0.0009661168036940071,
20526
+ "loss": 2.0915,
20527
+ "num_input_tokens_seen": 60440722624,
20528
+ "step": 115300
20529
+ },
20530
+ {
20531
+ "epoch": 1.1004471898589265,
20532
+ "grad_norm": 0.14336808025836945,
20533
+ "learning_rate": 0.0009650944676731382,
20534
+ "loss": 2.0846,
20535
+ "num_input_tokens_seen": 60466923616,
20536
+ "step": 115350
20537
+ },
20538
+ {
20539
+ "epoch": 1.1009241923751147,
20540
+ "grad_norm": 0.16042272746562958,
20541
+ "learning_rate": 0.0009640574942595195,
20542
+ "loss": 2.0942,
20543
+ "num_input_tokens_seen": 60493123456,
20544
+ "step": 115400
20545
+ },
20546
+ {
20547
+ "epoch": 1.101401194891303,
20548
+ "grad_norm": 0.14399364590644836,
20549
+ "learning_rate": 0.0009630059160886439,
20550
+ "loss": 2.0988,
20551
+ "num_input_tokens_seen": 60519323040,
20552
+ "step": 115450
20553
+ },
20554
+ {
20555
+ "epoch": 1.1018781974074914,
20556
+ "grad_norm": 0.14042776823043823,
20557
+ "learning_rate": 0.0009619397662556434,
20558
+ "loss": 2.0916,
20559
+ "num_input_tokens_seen": 60545534656,
20560
+ "step": 115500
20561
+ },
20562
+ {
20563
+ "epoch": 1.1018781974074914,
20564
+ "eval_loss": 2.0105109214782715,
20565
+ "eval_runtime": 82.3145,
20566
+ "eval_samples_per_second": 60.743,
20567
+ "eval_steps_per_second": 15.186,
20568
+ "num_input_tokens_seen": 60545534656,
20569
+ "step": 115500
20570
+ },
20571
+ {
20572
+ "epoch": 1.1023551999236796,
20573
+ "grad_norm": 0.1399744153022766,
20574
+ "learning_rate": 0.000960859078314247,
20575
+ "loss": 2.096,
20576
+ "num_input_tokens_seen": 60571738272,
20577
+ "step": 115550
20578
+ },
20579
+ {
20580
+ "epoch": 1.1028322024398678,
20581
+ "grad_norm": 0.14161787927150726,
20582
+ "learning_rate": 0.0009597638862757254,
20583
+ "loss": 2.0916,
20584
+ "num_input_tokens_seen": 60597952672,
20585
+ "step": 115600
20586
+ },
20587
+ {
20588
+ "epoch": 1.1033092049560562,
20589
+ "grad_norm": 0.14088015258312225,
20590
+ "learning_rate": 0.0009586542246078203,
20591
+ "loss": 2.0856,
20592
+ "num_input_tokens_seen": 60624155648,
20593
+ "step": 115650
20594
+ },
20595
+ {
20596
+ "epoch": 1.1037862074722444,
20597
+ "grad_norm": 0.13098938763141632,
20598
+ "learning_rate": 0.00095753012823366,
20599
+ "loss": 2.0849,
20600
+ "num_input_tokens_seen": 60650370048,
20601
+ "step": 115700
20602
+ },
20603
+ {
20604
+ "epoch": 1.1042632099884326,
20605
+ "grad_norm": 0.14463865756988525,
20606
+ "learning_rate": 0.0009563916325306594,
20607
+ "loss": 2.0918,
20608
+ "num_input_tokens_seen": 60676580928,
20609
+ "step": 115750
20610
+ },
20611
+ {
20612
+ "epoch": 1.104740212504621,
20613
+ "grad_norm": 0.14490677416324615,
20614
+ "learning_rate": 0.000955238773329408,
20615
+ "loss": 2.0996,
20616
+ "num_input_tokens_seen": 60702794432,
20617
+ "step": 115800
20618
+ },
20619
+ {
20620
+ "epoch": 1.1052172150208093,
20621
+ "grad_norm": 0.14372467994689941,
20622
+ "learning_rate": 0.0009540715869125407,
20623
+ "loss": 2.09,
20624
+ "num_input_tokens_seen": 60729000064,
20625
+ "step": 115850
20626
+ },
20627
+ {
20628
+ "epoch": 1.1056942175369975,
20629
+ "grad_norm": 0.16468504071235657,
20630
+ "learning_rate": 0.000952890110013597,
20631
+ "loss": 2.0901,
20632
+ "num_input_tokens_seen": 60755212896,
20633
+ "step": 115900
20634
+ },
20635
+ {
20636
+ "epoch": 1.1061712200531857,
20637
+ "grad_norm": 0.390666663646698,
20638
+ "learning_rate": 0.0009516943798158648,
20639
+ "loss": 2.0855,
20640
+ "num_input_tokens_seen": 60781425984,
20641
+ "step": 115950
20642
+ },
20643
+ {
20644
+ "epoch": 1.1066482225693741,
20645
+ "grad_norm": 0.14308005571365356,
20646
+ "learning_rate": 0.0009504844339512095,
20647
+ "loss": 2.1125,
20648
+ "num_input_tokens_seen": 60807636160,
20649
+ "step": 116000
20650
+ },
20651
+ {
20652
+ "epoch": 1.1066482225693741,
20653
+ "eval_loss": 2.0120937824249268,
20654
+ "eval_runtime": 82.7927,
20655
+ "eval_samples_per_second": 60.392,
20656
+ "eval_steps_per_second": 15.098,
20657
+ "num_input_tokens_seen": 60807636160,
20658
+ "step": 116000
20659
  }
20660
  ],
20661
  "logging_steps": 50,
20662
  "max_steps": 140000,
20663
+ "num_input_tokens_seen": 60807636160,
20664
  "num_train_epochs": 2,
20665
  "save_steps": 1000,
20666
  "stateful_callbacks": {
 
20675
  "attributes": {}
20676
  }
20677
  },
20678
+ "total_flos": 1.0761842469036442e+20,
20679
  "train_batch_size": 32,
20680
  "trial_name": null,
20681
  "trial_params": null