Azrail committed on
Commit
378f100
·
verified ·
1 Parent(s): e5e4f4e

Training in progress, step 119000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb226fadbf28661b9371114993dc12e49ac5975cdb3cc0b050988cda066eda63
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:423179ea0149a7aaeacb5ccaa10149a8392d7f119d23b5e82ddb6e09d76ee4bf
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:619f8c200e9aaadfdae5aad82237b7f7ba5a625617b8275ba58c98a0a1cd45f8
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b71ae6a920aee5962a410d286e3547ba68e15be1375e1283ae48d23a63cbab16
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eadabea5b840d3b07e42e9e423397807b167316e75ece7076e65c7e1fda35503
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8f8fb2244d43602b2b223fa5f88e945c708dd60e4c4c5e962793b5f1f77fe7b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c77e72696edbb72e0b5c20319181466e8d1ea3a266d160a365b8e9afc9f97b0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a469da166349e663b52b425176faaf03bae4cb82a5020b6687129f2f779fc711
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1257283232169049,
6
  "eval_steps": 500,
7
- "global_step": 118000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -21012,11 +21012,189 @@
21012
  "eval_steps_per_second": 15.111,
21013
  "num_input_tokens_seen": 61856020192,
21014
  "step": 118000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21015
  }
21016
  ],
21017
  "logging_steps": 50,
21018
  "max_steps": 140000,
21019
- "num_input_tokens_seen": 61856020192,
21020
  "num_train_epochs": 2,
21021
  "save_steps": 1000,
21022
  "stateful_callbacks": {
@@ -21031,7 +21209,7 @@
21031
  "attributes": {}
21032
  }
21033
  },
21034
- "total_flos": 1.0947387320175698e+20,
21035
  "train_batch_size": 32,
21036
  "trial_name": null,
21037
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1352683735406703,
6
  "eval_steps": 500,
7
+ "global_step": 119000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
21012
  "eval_steps_per_second": 15.111,
21013
  "num_input_tokens_seen": 61856020192,
21014
  "step": 118000
21015
+ },
21016
+ {
21017
+ "epoch": 1.1262053257330933,
21018
+ "grad_norm": 0.15097704529762268,
21019
+ "learning_rate": 0.0008891607141781631,
21020
+ "loss": 2.0857,
21021
+ "num_input_tokens_seen": 61882234592,
21022
+ "step": 118050
21023
+ },
21024
+ {
21025
+ "epoch": 1.1266823282492815,
21026
+ "grad_norm": 0.1383848935365677,
21027
+ "learning_rate": 0.0008873934395068005,
21028
+ "loss": 2.0858,
21029
+ "num_input_tokens_seen": 61908441120,
21030
+ "step": 118100
21031
+ },
21032
+ {
21033
+ "epoch": 1.1271593307654697,
21034
+ "grad_norm": 0.14688968658447266,
21035
+ "learning_rate": 0.0008856139728393666,
21036
+ "loss": 2.085,
21037
+ "num_input_tokens_seen": 61934653984,
21038
+ "step": 118150
21039
+ },
21040
+ {
21041
+ "epoch": 1.1276363332816581,
21042
+ "grad_norm": 0.14446312189102173,
21043
+ "learning_rate": 0.0008838223701790055,
21044
+ "loss": 2.0765,
21045
+ "num_input_tokens_seen": 61960867808,
21046
+ "step": 118200
21047
+ },
21048
+ {
21049
+ "epoch": 1.1281133357978463,
21050
+ "grad_norm": 0.1389646828174591,
21051
+ "learning_rate": 0.0008820186879108038,
21052
+ "loss": 2.0816,
21053
+ "num_input_tokens_seen": 61987070336,
21054
+ "step": 118250
21055
+ },
21056
+ {
21057
+ "epoch": 1.1285903383140345,
21058
+ "grad_norm": 0.14348453283309937,
21059
+ "learning_rate": 0.0008802029828000156,
21060
+ "loss": 2.0875,
21061
+ "num_input_tokens_seen": 62013276640,
21062
+ "step": 118300
21063
+ },
21064
+ {
21065
+ "epoch": 1.129067340830223,
21066
+ "grad_norm": 0.14246419072151184,
21067
+ "learning_rate": 0.0008783753119902765,
21068
+ "loss": 2.0828,
21069
+ "num_input_tokens_seen": 62039490144,
21070
+ "step": 118350
21071
+ },
21072
+ {
21073
+ "epoch": 1.1295443433464112,
21074
+ "grad_norm": 0.13848936557769775,
21075
+ "learning_rate": 0.0008765357330018055,
21076
+ "loss": 2.0895,
21077
+ "num_input_tokens_seen": 62065704544,
21078
+ "step": 118400
21079
+ },
21080
+ {
21081
+ "epoch": 1.1300213458625994,
21082
+ "grad_norm": 0.14894653856754303,
21083
+ "learning_rate": 0.0008746843037295936,
21084
+ "loss": 2.079,
21085
+ "num_input_tokens_seen": 62091916704,
21086
+ "step": 118450
21087
+ },
21088
+ {
21089
+ "epoch": 1.1304983483787878,
21090
+ "grad_norm": 0.1354195922613144,
21091
+ "learning_rate": 0.0008728210824415827,
21092
+ "loss": 2.0836,
21093
+ "num_input_tokens_seen": 62118128864,
21094
+ "step": 118500
21095
+ },
21096
+ {
21097
+ "epoch": 1.1304983483787878,
21098
+ "eval_loss": 2.004451274871826,
21099
+ "eval_runtime": 82.4857,
21100
+ "eval_samples_per_second": 60.617,
21101
+ "eval_steps_per_second": 15.154,
21102
+ "num_input_tokens_seen": 62118128864,
21103
+ "step": 118500
21104
+ },
21105
+ {
21106
+ "epoch": 1.130975350894976,
21107
+ "grad_norm": 0.14576098322868347,
21108
+ "learning_rate": 0.0008709461277768318,
21109
+ "loss": 2.0912,
21110
+ "num_input_tokens_seen": 62144343264,
21111
+ "step": 118550
21112
+ },
21113
+ {
21114
+ "epoch": 1.1314523534111642,
21115
+ "grad_norm": 0.14351360499858856,
21116
+ "learning_rate": 0.0008690594987436704,
21117
+ "loss": 2.0777,
21118
+ "num_input_tokens_seen": 62170554112,
21119
+ "step": 118600
21120
+ },
21121
+ {
21122
+ "epoch": 1.1319293559273524,
21123
+ "grad_norm": 0.14756879210472107,
21124
+ "learning_rate": 0.0008671612547178428,
21125
+ "loss": 2.0907,
21126
+ "num_input_tokens_seen": 62196764384,
21127
+ "step": 118650
21128
+ },
21129
+ {
21130
+ "epoch": 1.1324063584435409,
21131
+ "grad_norm": 0.15026496350765228,
21132
+ "learning_rate": 0.0008652514554406388,
21133
+ "loss": 2.0857,
21134
+ "num_input_tokens_seen": 62222966592,
21135
+ "step": 118700
21136
+ },
21137
+ {
21138
+ "epoch": 1.132883360959729,
21139
+ "grad_norm": 0.13817134499549866,
21140
+ "learning_rate": 0.0008633301610170136,
21141
+ "loss": 2.0851,
21142
+ "num_input_tokens_seen": 62249176192,
21143
+ "step": 118750
21144
+ },
21145
+ {
21146
+ "epoch": 1.1333603634759173,
21147
+ "grad_norm": 0.13346219062805176,
21148
+ "learning_rate": 0.0008613974319136957,
21149
+ "loss": 2.0856,
21150
+ "num_input_tokens_seen": 62275388064,
21151
+ "step": 118800
21152
+ },
21153
+ {
21154
+ "epoch": 1.1338373659921057,
21155
+ "grad_norm": 0.14300605654716492,
21156
+ "learning_rate": 0.0008594533289572853,
21157
+ "loss": 2.0835,
21158
+ "num_input_tokens_seen": 62301602464,
21159
+ "step": 118850
21160
+ },
21161
+ {
21162
+ "epoch": 1.134314368508294,
21163
+ "grad_norm": 0.13790345191955566,
21164
+ "learning_rate": 0.0008574979133323377,
21165
+ "loss": 2.0811,
21166
+ "num_input_tokens_seen": 62327812128,
21167
+ "step": 118900
21168
+ },
21169
+ {
21170
+ "epoch": 1.1347913710244821,
21171
+ "grad_norm": 0.1419474184513092,
21172
+ "learning_rate": 0.0008555312465794402,
21173
+ "loss": 2.0783,
21174
+ "num_input_tokens_seen": 62354024288,
21175
+ "step": 118950
21176
+ },
21177
+ {
21178
+ "epoch": 1.1352683735406703,
21179
+ "grad_norm": 0.15154699981212616,
21180
+ "learning_rate": 0.0008535533905932737,
21181
+ "loss": 2.0858,
21182
+ "num_input_tokens_seen": 62380238112,
21183
+ "step": 119000
21184
+ },
21185
+ {
21186
+ "epoch": 1.1352683735406703,
21187
+ "eval_loss": 2.0006425380706787,
21188
+ "eval_runtime": 82.1764,
21189
+ "eval_samples_per_second": 60.845,
21190
+ "eval_steps_per_second": 15.211,
21191
+ "num_input_tokens_seen": 62380238112,
21192
+ "step": 119000
21193
  }
21194
  ],
21195
  "logging_steps": 50,
21196
  "max_steps": 140000,
21197
+ "num_input_tokens_seen": 62380238112,
21198
  "num_train_epochs": 2,
21199
  "save_steps": 1000,
21200
  "stateful_callbacks": {
 
21209
  "attributes": {}
21210
  }
21211
  },
21212
+ "total_flos": 1.1040164330280837e+20,
21213
  "train_batch_size": 32,
21214
  "trial_name": null,
21215
  "trial_params": null