Azrail commited on
Commit
fc9192a
·
verified ·
1 Parent(s): 3dc6315

Training in progress, step 51000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c9956ceaa01a8262c17e82fea9ac349503f1643baa686fe83baf73d6c182cfd
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b71ed16611cd95fe8479b9b5158a65681e32cd86fc06fd6104792dca5e0ea90c
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9a8f8b0ff9c7ab62e432b714de9517f6859e2ebcb731ff15954b08eab3fa5fd
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4d0121bc94172a095cdea5c65ddbc39cc2a2d68c3e7dea1521191e5bf66d6e4
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08d6a67f7616cccd33f77a5e076df0611e7b35eb8ba28bbeb4122e81eca5afa0
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:871241677306799dd94bb012f99e77b35a49885274956fc7cf6b8c017fdd6180
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66168c288d1955c1c664cfa64be79d9023fb79ca5529a1e6b201d572885b2dfe
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38e628546b6b3793b4db9c04b0c48bd7f457b5c91e760c9c29b133754fb90815
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.23850125809413644,
6
  "eval_steps": 500,
7
- "global_step": 50000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8908,11 +8908,189 @@
8908
  "eval_steps_per_second": 23.298,
8909
  "num_input_tokens_seen": 13107195456,
8910
  "step": 50000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8911
  }
8912
  ],
8913
  "logging_steps": 50,
8914
  "max_steps": 70000,
8915
- "num_input_tokens_seen": 13107195456,
8916
  "num_train_epochs": 1,
8917
  "save_steps": 1000,
8918
  "stateful_callbacks": {
@@ -8927,7 +9105,7 @@
8927
  "attributes": {}
8928
  }
8929
  },
8930
- "total_flos": 3.5063027107076506e+18,
8931
  "train_batch_size": 64,
8932
  "trial_name": null,
8933
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.24327128325601918,
6
  "eval_steps": 500,
7
+ "global_step": 51000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8908
  "eval_steps_per_second": 23.298,
8909
  "num_input_tokens_seen": 13107195456,
8910
  "step": 50000
8911
+ },
8912
+ {
8913
+ "epoch": 0.23873975935223057,
8914
+ "grad_norm": 0.5069316029548645,
8915
+ "learning_rate": 0.001,
8916
+ "loss": 2.6591,
8917
+ "num_input_tokens_seen": 13120302656,
8918
+ "step": 50050
8919
+ },
8920
+ {
8921
+ "epoch": 0.23897826061032473,
8922
+ "grad_norm": 0.21306034922599792,
8923
+ "learning_rate": 0.001,
8924
+ "loss": 2.6455,
8925
+ "num_input_tokens_seen": 13133409856,
8926
+ "step": 50100
8927
+ },
8928
+ {
8929
+ "epoch": 0.23921676186841886,
8930
+ "grad_norm": 0.2045888900756836,
8931
+ "learning_rate": 0.001,
8932
+ "loss": 2.6227,
8933
+ "num_input_tokens_seen": 13146517056,
8934
+ "step": 50150
8935
+ },
8936
+ {
8937
+ "epoch": 0.239455263126513,
8938
+ "grad_norm": 0.2335623949766159,
8939
+ "learning_rate": 0.001,
8940
+ "loss": 2.6097,
8941
+ "num_input_tokens_seen": 13159624256,
8942
+ "step": 50200
8943
+ },
8944
+ {
8945
+ "epoch": 0.23969376438460713,
8946
+ "grad_norm": 0.19884036481380463,
8947
+ "learning_rate": 0.001,
8948
+ "loss": 2.6189,
8949
+ "num_input_tokens_seen": 13172731456,
8950
+ "step": 50250
8951
+ },
8952
+ {
8953
+ "epoch": 0.23993226564270126,
8954
+ "grad_norm": 0.21080589294433594,
8955
+ "learning_rate": 0.001,
8956
+ "loss": 2.6057,
8957
+ "num_input_tokens_seen": 13185838656,
8958
+ "step": 50300
8959
+ },
8960
+ {
8961
+ "epoch": 0.2401707669007954,
8962
+ "grad_norm": 0.21613669395446777,
8963
+ "learning_rate": 0.001,
8964
+ "loss": 2.6045,
8965
+ "num_input_tokens_seen": 13198945856,
8966
+ "step": 50350
8967
+ },
8968
+ {
8969
+ "epoch": 0.24040926815888955,
8970
+ "grad_norm": 0.2029023915529251,
8971
+ "learning_rate": 0.001,
8972
+ "loss": 2.6127,
8973
+ "num_input_tokens_seen": 13212053056,
8974
+ "step": 50400
8975
+ },
8976
+ {
8977
+ "epoch": 0.24064776941698368,
8978
+ "grad_norm": 0.2275777906179428,
8979
+ "learning_rate": 0.001,
8980
+ "loss": 2.6149,
8981
+ "num_input_tokens_seen": 13225160256,
8982
+ "step": 50450
8983
+ },
8984
+ {
8985
+ "epoch": 0.2408862706750778,
8986
+ "grad_norm": 0.3332397937774658,
8987
+ "learning_rate": 0.001,
8988
+ "loss": 2.6013,
8989
+ "num_input_tokens_seen": 13238267456,
8990
+ "step": 50500
8991
+ },
8992
+ {
8993
+ "epoch": 0.2408862706750778,
8994
+ "eval_loss": 2.5022270679473877,
8995
+ "eval_runtime": 53.5942,
8996
+ "eval_samples_per_second": 93.294,
8997
+ "eval_steps_per_second": 23.323,
8998
+ "num_input_tokens_seen": 13238267456,
8999
+ "step": 50500
9000
+ },
9001
+ {
9002
+ "epoch": 0.24112477193317194,
9003
+ "grad_norm": 0.2197851538658142,
9004
+ "learning_rate": 0.001,
9005
+ "loss": 2.6326,
9006
+ "num_input_tokens_seen": 13251374656,
9007
+ "step": 50550
9008
+ },
9009
+ {
9010
+ "epoch": 0.24136327319126608,
9011
+ "grad_norm": 0.2201780080795288,
9012
+ "learning_rate": 0.001,
9013
+ "loss": 2.6265,
9014
+ "num_input_tokens_seen": 13264481856,
9015
+ "step": 50600
9016
+ },
9017
+ {
9018
+ "epoch": 0.2416017744493602,
9019
+ "grad_norm": 0.2196362316608429,
9020
+ "learning_rate": 0.001,
9021
+ "loss": 2.6272,
9022
+ "num_input_tokens_seen": 13277589056,
9023
+ "step": 50650
9024
+ },
9025
+ {
9026
+ "epoch": 0.24184027570745437,
9027
+ "grad_norm": 0.2234160453081131,
9028
+ "learning_rate": 0.001,
9029
+ "loss": 2.6178,
9030
+ "num_input_tokens_seen": 13290696256,
9031
+ "step": 50700
9032
+ },
9033
+ {
9034
+ "epoch": 0.2420787769655485,
9035
+ "grad_norm": 0.24019016325473785,
9036
+ "learning_rate": 0.001,
9037
+ "loss": 2.6142,
9038
+ "num_input_tokens_seen": 13303803456,
9039
+ "step": 50750
9040
+ },
9041
+ {
9042
+ "epoch": 0.24231727822364263,
9043
+ "grad_norm": 0.21481236815452576,
9044
+ "learning_rate": 0.001,
9045
+ "loss": 2.6149,
9046
+ "num_input_tokens_seen": 13316910656,
9047
+ "step": 50800
9048
+ },
9049
+ {
9050
+ "epoch": 0.24255577948173676,
9051
+ "grad_norm": 0.20477178692817688,
9052
+ "learning_rate": 0.001,
9053
+ "loss": 2.5977,
9054
+ "num_input_tokens_seen": 13330017856,
9055
+ "step": 50850
9056
+ },
9057
+ {
9058
+ "epoch": 0.2427942807398309,
9059
+ "grad_norm": 0.20742499828338623,
9060
+ "learning_rate": 0.001,
9061
+ "loss": 2.6153,
9062
+ "num_input_tokens_seen": 13343125056,
9063
+ "step": 50900
9064
+ },
9065
+ {
9066
+ "epoch": 0.24303278199792505,
9067
+ "grad_norm": 0.21933062374591827,
9068
+ "learning_rate": 0.001,
9069
+ "loss": 2.5966,
9070
+ "num_input_tokens_seen": 13356232256,
9071
+ "step": 50950
9072
+ },
9073
+ {
9074
+ "epoch": 0.24327128325601918,
9075
+ "grad_norm": 0.3282420337200165,
9076
+ "learning_rate": 0.001,
9077
+ "loss": 2.6063,
9078
+ "num_input_tokens_seen": 13369339456,
9079
+ "step": 51000
9080
+ },
9081
+ {
9082
+ "epoch": 0.24327128325601918,
9083
+ "eval_loss": 2.4981296062469482,
9084
+ "eval_runtime": 53.5536,
9085
+ "eval_samples_per_second": 93.364,
9086
+ "eval_steps_per_second": 23.341,
9087
+ "num_input_tokens_seen": 13369339456,
9088
+ "step": 51000
9089
  }
9090
  ],
9091
  "logging_steps": 50,
9092
  "max_steps": 70000,
9093
+ "num_input_tokens_seen": 13369339456,
9094
  "num_train_epochs": 1,
9095
  "save_steps": 1000,
9096
  "stateful_callbacks": {
 
9105
  "attributes": {}
9106
  }
9107
  },
9108
+ "total_flos": 3.5764287892330906e+18,
9109
  "train_batch_size": 64,
9110
  "trial_name": null,
9111
  "trial_params": null