Azrail committed on
Commit
26fa5fb
·
verified ·
1 Parent(s): 8fce9b0

Training in progress, step 43000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b793c31018c10b83151888a761e5fecf881d8cfcf10fe82ad108fb7a30b9cb35
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ef4495c71186600e4deb9626160177c8fff186d1b83ba3e101354820ff0b557
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c753061fb3a47402b7408e67c6f3761fca04d13fb94ac46b9adfdfc16d0184d4
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4fe7eac54f364f5be220dedbdbb5b62a67232200bda7c79c78a104963651e13
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9aaf95bbf390f32ec661a712de605a0c816388cfa815f81914058fe6bdabdcd9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37df8b5d43f22ad1aaa4d7dfd1f99c1668bea9e213ed7e601e62de46919c3f7c
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a94a7467707318fda39e274661a096a9de559314c283be40d75a871d8d1d3d18
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c21038d5c74dc9feef98b9cc841f29561ac202ab70974b8a5e9d4e813a417597
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2003410567990746,
6
  "eval_steps": 500,
7
- "global_step": 42000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7484,11 +7484,189 @@
7484
  "eval_steps_per_second": 24.246,
7485
  "num_input_tokens_seen": 11010043456,
7486
  "step": 42000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7487
  }
7488
  ],
7489
  "logging_steps": 50,
7490
  "max_steps": 70000,
7491
- "num_input_tokens_seen": 11010043456,
7492
  "num_train_epochs": 1,
7493
  "save_steps": 1000,
7494
  "stateful_callbacks": {
@@ -7503,7 +7681,7 @@
7503
  "attributes": {}
7504
  }
7505
  },
7506
- "total_flos": 2.9452940825041306e+18,
7507
  "train_batch_size": 64,
7508
  "trial_name": null,
7509
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.20511108196095734,
6
  "eval_steps": 500,
7
+ "global_step": 43000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7484
  "eval_steps_per_second": 24.246,
7485
  "num_input_tokens_seen": 11010043456,
7486
  "step": 42000
7487
+ },
7488
+ {
7489
+ "epoch": 0.20057955805716876,
7490
+ "grad_norm": 0.17952106893062592,
7491
+ "learning_rate": 0.001,
7492
+ "loss": 2.6165,
7493
+ "num_input_tokens_seen": 11023150656,
7494
+ "step": 42050
7495
+ },
7496
+ {
7497
+ "epoch": 0.2008180593152629,
7498
+ "grad_norm": 0.20292694866657257,
7499
+ "learning_rate": 0.001,
7500
+ "loss": 2.6357,
7501
+ "num_input_tokens_seen": 11036257856,
7502
+ "step": 42100
7503
+ },
7504
+ {
7505
+ "epoch": 0.20105656057335702,
7506
+ "grad_norm": 0.19588933885097504,
7507
+ "learning_rate": 0.001,
7508
+ "loss": 2.6102,
7509
+ "num_input_tokens_seen": 11049365056,
7510
+ "step": 42150
7511
+ },
7512
+ {
7513
+ "epoch": 0.20129506183145116,
7514
+ "grad_norm": 0.1982785314321518,
7515
+ "learning_rate": 0.001,
7516
+ "loss": 2.6019,
7517
+ "num_input_tokens_seen": 11062472256,
7518
+ "step": 42200
7519
+ },
7520
+ {
7521
+ "epoch": 0.2015335630895453,
7522
+ "grad_norm": 0.18049876391887665,
7523
+ "learning_rate": 0.001,
7524
+ "loss": 2.6081,
7525
+ "num_input_tokens_seen": 11075579456,
7526
+ "step": 42250
7527
+ },
7528
+ {
7529
+ "epoch": 0.20177206434763945,
7530
+ "grad_norm": 0.2069908082485199,
7531
+ "learning_rate": 0.001,
7532
+ "loss": 2.6173,
7533
+ "num_input_tokens_seen": 11088686656,
7534
+ "step": 42300
7535
+ },
7536
+ {
7537
+ "epoch": 0.20201056560573358,
7538
+ "grad_norm": 0.2415982335805893,
7539
+ "learning_rate": 0.001,
7540
+ "loss": 2.6173,
7541
+ "num_input_tokens_seen": 11101793856,
7542
+ "step": 42350
7543
+ },
7544
+ {
7545
+ "epoch": 0.2022490668638277,
7546
+ "grad_norm": 0.20267252624034882,
7547
+ "learning_rate": 0.001,
7548
+ "loss": 2.6299,
7549
+ "num_input_tokens_seen": 11114901056,
7550
+ "step": 42400
7551
+ },
7552
+ {
7553
+ "epoch": 0.20248756812192184,
7554
+ "grad_norm": 0.20683065056800842,
7555
+ "learning_rate": 0.001,
7556
+ "loss": 2.6282,
7557
+ "num_input_tokens_seen": 11128008256,
7558
+ "step": 42450
7559
+ },
7560
+ {
7561
+ "epoch": 0.20272606938001597,
7562
+ "grad_norm": 0.22137881815433502,
7563
+ "learning_rate": 0.001,
7564
+ "loss": 2.6271,
7565
+ "num_input_tokens_seen": 11141115456,
7566
+ "step": 42500
7567
+ },
7568
+ {
7569
+ "epoch": 0.20272606938001597,
7570
+ "eval_loss": 2.5125572681427,
7571
+ "eval_runtime": 51.794,
7572
+ "eval_samples_per_second": 96.536,
7573
+ "eval_steps_per_second": 24.134,
7574
+ "num_input_tokens_seen": 11141115456,
7575
+ "step": 42500
7576
+ },
7577
+ {
7578
+ "epoch": 0.2029645706381101,
7579
+ "grad_norm": 0.20610037446022034,
7580
+ "learning_rate": 0.001,
7581
+ "loss": 2.6255,
7582
+ "num_input_tokens_seen": 11154222656,
7583
+ "step": 42550
7584
+ },
7585
+ {
7586
+ "epoch": 0.20320307189620426,
7587
+ "grad_norm": 0.21218810975551605,
7588
+ "learning_rate": 0.001,
7589
+ "loss": 2.6149,
7590
+ "num_input_tokens_seen": 11167329856,
7591
+ "step": 42600
7592
+ },
7593
+ {
7594
+ "epoch": 0.2034415731542984,
7595
+ "grad_norm": 0.19685466587543488,
7596
+ "learning_rate": 0.001,
7597
+ "loss": 2.6208,
7598
+ "num_input_tokens_seen": 11180437056,
7599
+ "step": 42650
7600
+ },
7601
+ {
7602
+ "epoch": 0.20368007441239253,
7603
+ "grad_norm": 0.20507460832595825,
7604
+ "learning_rate": 0.001,
7605
+ "loss": 2.6227,
7606
+ "num_input_tokens_seen": 11193544256,
7607
+ "step": 42700
7608
+ },
7609
+ {
7610
+ "epoch": 0.20391857567048666,
7611
+ "grad_norm": 0.20014505088329315,
7612
+ "learning_rate": 0.001,
7613
+ "loss": 2.6238,
7614
+ "num_input_tokens_seen": 11206651456,
7615
+ "step": 42750
7616
+ },
7617
+ {
7618
+ "epoch": 0.2041570769285808,
7619
+ "grad_norm": 0.1907282918691635,
7620
+ "learning_rate": 0.001,
7621
+ "loss": 2.6157,
7622
+ "num_input_tokens_seen": 11219758656,
7623
+ "step": 42800
7624
+ },
7625
+ {
7626
+ "epoch": 0.20439557818667495,
7627
+ "grad_norm": 0.18553833663463593,
7628
+ "learning_rate": 0.001,
7629
+ "loss": 2.6123,
7630
+ "num_input_tokens_seen": 11232865856,
7631
+ "step": 42850
7632
+ },
7633
+ {
7634
+ "epoch": 0.20463407944476908,
7635
+ "grad_norm": 0.20382866263389587,
7636
+ "learning_rate": 0.001,
7637
+ "loss": 2.6163,
7638
+ "num_input_tokens_seen": 11245973056,
7639
+ "step": 42900
7640
+ },
7641
+ {
7642
+ "epoch": 0.2048725807028632,
7643
+ "grad_norm": 0.18923860788345337,
7644
+ "learning_rate": 0.001,
7645
+ "loss": 2.5981,
7646
+ "num_input_tokens_seen": 11259080256,
7647
+ "step": 42950
7648
+ },
7649
+ {
7650
+ "epoch": 0.20511108196095734,
7651
+ "grad_norm": 0.19230851531028748,
7652
+ "learning_rate": 0.001,
7653
+ "loss": 2.618,
7654
+ "num_input_tokens_seen": 11272187456,
7655
+ "step": 43000
7656
+ },
7657
+ {
7658
+ "epoch": 0.20511108196095734,
7659
+ "eval_loss": 2.5047237873077393,
7660
+ "eval_runtime": 51.2959,
7661
+ "eval_samples_per_second": 97.474,
7662
+ "eval_steps_per_second": 24.368,
7663
+ "num_input_tokens_seen": 11272187456,
7664
+ "step": 43000
7665
  }
7666
  ],
7667
  "logging_steps": 50,
7668
  "max_steps": 70000,
7669
+ "num_input_tokens_seen": 11272187456,
7670
  "num_train_epochs": 1,
7671
  "save_steps": 1000,
7672
  "stateful_callbacks": {
 
7681
  "attributes": {}
7682
  }
7683
  },
7684
+ "total_flos": 3.0154201610295706e+18,
7685
  "train_batch_size": 64,
7686
  "trial_name": null,
7687
  "trial_params": null