Azrail commited on
Commit
1f4e6ad
·
verified ·
1 Parent(s): f5f3d48

Training in progress, step 60000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb24e6d8f2ac2f9fba055776f81932cc139a95f7fc40aa55fb0ec1c2a4f8255a
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3d4eb10327c6f996a0988361f6ad9bbab09e394aba34b1a396d7082da2216c0
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20de72116e7f03e1795ea16116920f2218782186eb0cf45bda609f4712918191
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1da98e221b67155367bda2e5baaef41263bc46b4743e333b4e678859da5c6df
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfe4fcebd5141fdf7604535ed8dc60cda464d7e4d084d78ec5c9b7105325f9b5
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6a4cb233f004dcf5c1bd7310c625e6acfeb53e49f5aa9a513759dc7631fff0b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c63af946e84034ef27ffe1d1d59b07405d72b5713d1851e086bcc930b39f47b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be823a58640077d89dc450d2caf77b9f9c93851d1d9a6e787b2d5f1c9c9930be
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.281431484551081,
6
  "eval_steps": 500,
7
- "global_step": 59000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -10510,11 +10510,189 @@
10510
  "eval_steps_per_second": 23.434,
10511
  "num_input_tokens_seen": 15466491456,
10512
  "step": 59000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10513
  }
10514
  ],
10515
  "logging_steps": 50,
10516
  "max_steps": 70000,
10517
- "num_input_tokens_seen": 15466491456,
10518
  "num_train_epochs": 1,
10519
  "save_steps": 1000,
10520
  "stateful_callbacks": {
@@ -10529,7 +10707,7 @@
10529
  "attributes": {}
10530
  }
10531
  },
10532
- "total_flos": 4.1374374174366106e+18,
10533
  "train_batch_size": 64,
10534
  "trial_name": null,
10535
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2862015097129637,
6
  "eval_steps": 500,
7
+ "global_step": 60000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
10510
  "eval_steps_per_second": 23.434,
10511
  "num_input_tokens_seen": 15466491456,
10512
  "step": 59000
10513
+ },
10514
+ {
10515
+ "epoch": 0.2816699858091751,
10516
+ "grad_norm": 0.3338637351989746,
10517
+ "learning_rate": 0.0008873934395068005,
10518
+ "loss": 2.587,
10519
+ "num_input_tokens_seen": 15479598656,
10520
+ "step": 59050
10521
+ },
10522
+ {
10523
+ "epoch": 0.2819084870672693,
10524
+ "grad_norm": 0.20848780870437622,
10525
+ "learning_rate": 0.0008838223701790055,
10526
+ "loss": 2.5989,
10527
+ "num_input_tokens_seen": 15492705856,
10528
+ "step": 59100
10529
+ },
10530
+ {
10531
+ "epoch": 0.2821469883253634,
10532
+ "grad_norm": 0.21479378640651703,
10533
+ "learning_rate": 0.0008802029828000156,
10534
+ "loss": 2.6052,
10535
+ "num_input_tokens_seen": 15505813056,
10536
+ "step": 59150
10537
+ },
10538
+ {
10539
+ "epoch": 0.28238548958345755,
10540
+ "grad_norm": 0.1944151073694229,
10541
+ "learning_rate": 0.0008765357330018055,
10542
+ "loss": 2.6044,
10543
+ "num_input_tokens_seen": 15518920256,
10544
+ "step": 59200
10545
+ },
10546
+ {
10547
+ "epoch": 0.2826239908415517,
10548
+ "grad_norm": 0.2078033685684204,
10549
+ "learning_rate": 0.0008728210824415827,
10550
+ "loss": 2.5929,
10551
+ "num_input_tokens_seen": 15532027456,
10552
+ "step": 59250
10553
+ },
10554
+ {
10555
+ "epoch": 0.2828624920996458,
10556
+ "grad_norm": 0.19340284168720245,
10557
+ "learning_rate": 0.0008690594987436704,
10558
+ "loss": 2.5875,
10559
+ "num_input_tokens_seen": 15545134656,
10560
+ "step": 59300
10561
+ },
10562
+ {
10563
+ "epoch": 0.28310099335773997,
10564
+ "grad_norm": 0.22354012727737427,
10565
+ "learning_rate": 0.0008652514554406388,
10566
+ "loss": 2.5976,
10567
+ "num_input_tokens_seen": 15558241856,
10568
+ "step": 59350
10569
+ },
10570
+ {
10571
+ "epoch": 0.2833394946158341,
10572
+ "grad_norm": 0.26784005761146545,
10573
+ "learning_rate": 0.0008613974319136957,
10574
+ "loss": 2.5868,
10575
+ "num_input_tokens_seen": 15571349056,
10576
+ "step": 59400
10577
+ },
10578
+ {
10579
+ "epoch": 0.28357799587392823,
10580
+ "grad_norm": 0.20749828219413757,
10581
+ "learning_rate": 0.0008574979133323377,
10582
+ "loss": 2.5784,
10583
+ "num_input_tokens_seen": 15584456256,
10584
+ "step": 59450
10585
+ },
10586
+ {
10587
+ "epoch": 0.2838164971320224,
10588
+ "grad_norm": 0.21545729041099548,
10589
+ "learning_rate": 0.0008535533905932737,
10590
+ "loss": 2.5939,
10591
+ "num_input_tokens_seen": 15597563456,
10592
+ "step": 59500
10593
+ },
10594
+ {
10595
+ "epoch": 0.2838164971320224,
10596
+ "eval_loss": 2.469989538192749,
10597
+ "eval_runtime": 54.0784,
10598
+ "eval_samples_per_second": 92.458,
10599
+ "eval_steps_per_second": 23.115,
10600
+ "num_input_tokens_seen": 15597563456,
10601
+ "step": 59500
10602
+ },
10603
+ {
10604
+ "epoch": 0.2840549983901165,
10605
+ "grad_norm": 0.20836423337459564,
10606
+ "learning_rate": 0.0008495643602586287,
10607
+ "loss": 2.5858,
10608
+ "num_input_tokens_seen": 15610670656,
10609
+ "step": 59550
10610
+ },
10611
+ {
10612
+ "epoch": 0.28429349964821066,
10613
+ "grad_norm": 0.20427604019641876,
10614
+ "learning_rate": 0.0008455313244934324,
10615
+ "loss": 2.5781,
10616
+ "num_input_tokens_seen": 15623777856,
10617
+ "step": 59600
10618
+ },
10619
+ {
10620
+ "epoch": 0.28453200090630476,
10621
+ "grad_norm": 0.2341683804988861,
10622
+ "learning_rate": 0.0008414547910024035,
10623
+ "loss": 2.5713,
10624
+ "num_input_tokens_seen": 15636885056,
10625
+ "step": 59650
10626
+ },
10627
+ {
10628
+ "epoch": 0.2847705021643989,
10629
+ "grad_norm": 0.20808522403240204,
10630
+ "learning_rate": 0.0008373352729660373,
10631
+ "loss": 2.5751,
10632
+ "num_input_tokens_seen": 15649992256,
10633
+ "step": 59700
10634
+ },
10635
+ {
10636
+ "epoch": 0.2850090034224931,
10637
+ "grad_norm": 0.21032562851905823,
10638
+ "learning_rate": 0.000833173288976002,
10639
+ "loss": 2.5784,
10640
+ "num_input_tokens_seen": 15663099456,
10641
+ "step": 59750
10642
+ },
10643
+ {
10644
+ "epoch": 0.2852475046805872,
10645
+ "grad_norm": 0.23485584557056427,
10646
+ "learning_rate": 0.0008289693629698564,
10647
+ "loss": 2.5974,
10648
+ "num_input_tokens_seen": 15676206656,
10649
+ "step": 59800
10650
+ },
10651
+ {
10652
+ "epoch": 0.28548600593868134,
10653
+ "grad_norm": 0.2229880541563034,
10654
+ "learning_rate": 0.0008247240241650918,
10655
+ "loss": 2.5834,
10656
+ "num_input_tokens_seen": 15689313856,
10657
+ "step": 59850
10658
+ },
10659
+ {
10660
+ "epoch": 0.28572450719677545,
10661
+ "grad_norm": 0.21837118268013,
10662
+ "learning_rate": 0.000820437806992512,
10663
+ "loss": 2.5734,
10664
+ "num_input_tokens_seen": 15702421056,
10665
+ "step": 59900
10666
+ },
10667
+ {
10668
+ "epoch": 0.2859630084548696,
10669
+ "grad_norm": 0.2157929688692093,
10670
+ "learning_rate": 0.0008161112510289549,
10671
+ "loss": 2.587,
10672
+ "num_input_tokens_seen": 15715528256,
10673
+ "step": 59950
10674
+ },
10675
+ {
10676
+ "epoch": 0.2862015097129637,
10677
+ "grad_norm": 0.24053893983364105,
10678
+ "learning_rate": 0.0008117449009293668,
10679
+ "loss": 2.5853,
10680
+ "num_input_tokens_seen": 15728635456,
10681
+ "step": 60000
10682
+ },
10683
+ {
10684
+ "epoch": 0.2862015097129637,
10685
+ "eval_loss": 2.470459461212158,
10686
+ "eval_runtime": 53.5859,
10687
+ "eval_samples_per_second": 93.308,
10688
+ "eval_steps_per_second": 23.327,
10689
+ "num_input_tokens_seen": 15728635456,
10690
+ "step": 60000
10691
  }
10692
  ],
10693
  "logging_steps": 50,
10694
  "max_steps": 70000,
10695
+ "num_input_tokens_seen": 15728635456,
10696
  "num_train_epochs": 1,
10697
  "save_steps": 1000,
10698
  "stateful_callbacks": {
 
10707
  "attributes": {}
10708
  }
10709
  },
10710
+ "total_flos": 4.2075634959620506e+18,
10711
  "train_batch_size": 64,
10712
  "trial_name": null,
10713
  "trial_params": null