Azrail commited on
Commit
1f3fda8
·
verified ·
1 Parent(s): 0e7102a

Training in progress, step 44000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ef4495c71186600e4deb9626160177c8fff186d1b83ba3e101354820ff0b557
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9605858ca8b64eb89cb8c33fd56e7ec671551b1e5005f2598e074ca5b397cafd
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4fe7eac54f364f5be220dedbdbb5b62a67232200bda7c79c78a104963651e13
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2584fadfe92de84830b6f68a11ff9f4508f42d733151a8e29faa8885164fa9e
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37df8b5d43f22ad1aaa4d7dfd1f99c1668bea9e213ed7e601e62de46919c3f7c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c93fe38009a049e639e4ec9c47956d4822c559f5ecfd6d8454c217a91259ec7
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c21038d5c74dc9feef98b9cc841f29561ac202ab70974b8a5e9d4e813a417597
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5356afb30d3aa5783dfb45e83d3ec8fbfdbc01397770efc134aa996a2dcb7311
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.20511108196095734,
6
  "eval_steps": 500,
7
- "global_step": 43000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7662,11 +7662,189 @@
7662
  "eval_steps_per_second": 24.368,
7663
  "num_input_tokens_seen": 11272187456,
7664
  "step": 43000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7665
  }
7666
  ],
7667
  "logging_steps": 50,
7668
  "max_steps": 70000,
7669
- "num_input_tokens_seen": 11272187456,
7670
  "num_train_epochs": 1,
7671
  "save_steps": 1000,
7672
  "stateful_callbacks": {
@@ -7681,7 +7859,7 @@
7681
  "attributes": {}
7682
  }
7683
  },
7684
- "total_flos": 3.0154201610295706e+18,
7685
  "train_batch_size": 64,
7686
  "trial_name": null,
7687
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.20988110712284008,
6
  "eval_steps": 500,
7
+ "global_step": 44000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7662
  "eval_steps_per_second": 24.368,
7663
  "num_input_tokens_seen": 11272187456,
7664
  "step": 43000
7665
+ },
7666
+ {
7667
+ "epoch": 0.20534958321905147,
7668
+ "grad_norm": 0.22746357321739197,
7669
+ "learning_rate": 0.001,
7670
+ "loss": 2.6281,
7671
+ "num_input_tokens_seen": 11285294656,
7672
+ "step": 43050
7673
+ },
7674
+ {
7675
+ "epoch": 0.2055880844771456,
7676
+ "grad_norm": 0.21107150614261627,
7677
+ "learning_rate": 0.001,
7678
+ "loss": 2.6154,
7679
+ "num_input_tokens_seen": 11298401856,
7680
+ "step": 43100
7681
+ },
7682
+ {
7683
+ "epoch": 0.20582658573523976,
7684
+ "grad_norm": 0.18025045096874237,
7685
+ "learning_rate": 0.001,
7686
+ "loss": 2.6141,
7687
+ "num_input_tokens_seen": 11311509056,
7688
+ "step": 43150
7689
+ },
7690
+ {
7691
+ "epoch": 0.2060650869933339,
7692
+ "grad_norm": 0.2009642869234085,
7693
+ "learning_rate": 0.001,
7694
+ "loss": 2.6133,
7695
+ "num_input_tokens_seen": 11324616256,
7696
+ "step": 43200
7697
+ },
7698
+ {
7699
+ "epoch": 0.20630358825142803,
7700
+ "grad_norm": 0.1872788518667221,
7701
+ "learning_rate": 0.001,
7702
+ "loss": 2.6197,
7703
+ "num_input_tokens_seen": 11337723456,
7704
+ "step": 43250
7705
+ },
7706
+ {
7707
+ "epoch": 0.20654208950952216,
7708
+ "grad_norm": 0.216310054063797,
7709
+ "learning_rate": 0.001,
7710
+ "loss": 2.6353,
7711
+ "num_input_tokens_seen": 11350830656,
7712
+ "step": 43300
7713
+ },
7714
+ {
7715
+ "epoch": 0.2067805907676163,
7716
+ "grad_norm": 0.2705513536930084,
7717
+ "learning_rate": 0.001,
7718
+ "loss": 2.6333,
7719
+ "num_input_tokens_seen": 11363937856,
7720
+ "step": 43350
7721
+ },
7722
+ {
7723
+ "epoch": 0.20701909202571045,
7724
+ "grad_norm": 0.3040550649166107,
7725
+ "learning_rate": 0.001,
7726
+ "loss": 2.6094,
7727
+ "num_input_tokens_seen": 11377045056,
7728
+ "step": 43400
7729
+ },
7730
+ {
7731
+ "epoch": 0.20725759328380458,
7732
+ "grad_norm": 0.2075599879026413,
7733
+ "learning_rate": 0.001,
7734
+ "loss": 2.6225,
7735
+ "num_input_tokens_seen": 11390152256,
7736
+ "step": 43450
7737
+ },
7738
+ {
7739
+ "epoch": 0.2074960945418987,
7740
+ "grad_norm": 0.22293590009212494,
7741
+ "learning_rate": 0.001,
7742
+ "loss": 2.6271,
7743
+ "num_input_tokens_seen": 11403259456,
7744
+ "step": 43500
7745
+ },
7746
+ {
7747
+ "epoch": 0.2074960945418987,
7748
+ "eval_loss": 2.5097975730895996,
7749
+ "eval_runtime": 51.7037,
7750
+ "eval_samples_per_second": 96.705,
7751
+ "eval_steps_per_second": 24.176,
7752
+ "num_input_tokens_seen": 11403259456,
7753
+ "step": 43500
7754
+ },
7755
+ {
7756
+ "epoch": 0.20773459579999284,
7757
+ "grad_norm": 0.21221335232257843,
7758
+ "learning_rate": 0.001,
7759
+ "loss": 2.618,
7760
+ "num_input_tokens_seen": 11416366656,
7761
+ "step": 43550
7762
+ },
7763
+ {
7764
+ "epoch": 0.20797309705808698,
7765
+ "grad_norm": 0.19894948601722717,
7766
+ "learning_rate": 0.001,
7767
+ "loss": 2.6305,
7768
+ "num_input_tokens_seen": 11429473856,
7769
+ "step": 43600
7770
+ },
7771
+ {
7772
+ "epoch": 0.2082115983161811,
7773
+ "grad_norm": 0.29371336102485657,
7774
+ "learning_rate": 0.001,
7775
+ "loss": 2.6211,
7776
+ "num_input_tokens_seen": 11442581056,
7777
+ "step": 43650
7778
+ },
7779
+ {
7780
+ "epoch": 0.20845009957427527,
7781
+ "grad_norm": 0.19441936910152435,
7782
+ "learning_rate": 0.001,
7783
+ "loss": 2.6355,
7784
+ "num_input_tokens_seen": 11455688256,
7785
+ "step": 43700
7786
+ },
7787
+ {
7788
+ "epoch": 0.2086886008323694,
7789
+ "grad_norm": 0.19868114590644836,
7790
+ "learning_rate": 0.001,
7791
+ "loss": 2.6206,
7792
+ "num_input_tokens_seen": 11468795456,
7793
+ "step": 43750
7794
+ },
7795
+ {
7796
+ "epoch": 0.20892710209046353,
7797
+ "grad_norm": 0.19971340894699097,
7798
+ "learning_rate": 0.001,
7799
+ "loss": 2.6124,
7800
+ "num_input_tokens_seen": 11481902656,
7801
+ "step": 43800
7802
+ },
7803
+ {
7804
+ "epoch": 0.20916560334855766,
7805
+ "grad_norm": 0.22261051833629608,
7806
+ "learning_rate": 0.001,
7807
+ "loss": 2.623,
7808
+ "num_input_tokens_seen": 11495009856,
7809
+ "step": 43850
7810
+ },
7811
+ {
7812
+ "epoch": 0.2094041046066518,
7813
+ "grad_norm": 0.20982281863689423,
7814
+ "learning_rate": 0.001,
7815
+ "loss": 2.6182,
7816
+ "num_input_tokens_seen": 11508117056,
7817
+ "step": 43900
7818
+ },
7819
+ {
7820
+ "epoch": 0.20964260586474592,
7821
+ "grad_norm": 0.2216535359621048,
7822
+ "learning_rate": 0.001,
7823
+ "loss": 2.6086,
7824
+ "num_input_tokens_seen": 11521224256,
7825
+ "step": 43950
7826
+ },
7827
+ {
7828
+ "epoch": 0.20988110712284008,
7829
+ "grad_norm": 0.19298988580703735,
7830
+ "learning_rate": 0.001,
7831
+ "loss": 2.6364,
7832
+ "num_input_tokens_seen": 11534331456,
7833
+ "step": 44000
7834
+ },
7835
+ {
7836
+ "epoch": 0.20988110712284008,
7837
+ "eval_loss": 2.5009121894836426,
7838
+ "eval_runtime": 51.4356,
7839
+ "eval_samples_per_second": 97.209,
7840
+ "eval_steps_per_second": 24.302,
7841
+ "num_input_tokens_seen": 11534331456,
7842
+ "step": 44000
7843
  }
7844
  ],
7845
  "logging_steps": 50,
7846
  "max_steps": 70000,
7847
+ "num_input_tokens_seen": 11534331456,
7848
  "num_train_epochs": 1,
7849
  "save_steps": 1000,
7850
  "stateful_callbacks": {
 
7859
  "attributes": {}
7860
  }
7861
  },
7862
+ "total_flos": 3.0855462395550106e+18,
7863
  "train_batch_size": 64,
7864
  "trial_name": null,
7865
  "trial_params": null