Training in progress, step 119000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:423179ea0149a7aaeacb5ccaa10149a8392d7f119d23b5e82ddb6e09d76ee4bf
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b71ae6a920aee5962a410d286e3547ba68e15be1375e1283ae48d23a63cbab16
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8f8fb2244d43602b2b223fa5f88e945c708dd60e4c4c5e962793b5f1f77fe7b
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a469da166349e663b52b425176faaf03bae4cb82a5020b6687129f2f779fc711
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -21012,11 +21012,189 @@
|
|
| 21012 |
"eval_steps_per_second": 15.111,
|
| 21013 |
"num_input_tokens_seen": 61856020192,
|
| 21014 |
"step": 118000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21015 |
}
|
| 21016 |
],
|
| 21017 |
"logging_steps": 50,
|
| 21018 |
"max_steps": 140000,
|
| 21019 |
-
"num_input_tokens_seen":
|
| 21020 |
"num_train_epochs": 2,
|
| 21021 |
"save_steps": 1000,
|
| 21022 |
"stateful_callbacks": {
|
|
@@ -21031,7 +21209,7 @@
|
|
| 21031 |
"attributes": {}
|
| 21032 |
}
|
| 21033 |
},
|
| 21034 |
-
"total_flos": 1.
|
| 21035 |
"train_batch_size": 32,
|
| 21036 |
"trial_name": null,
|
| 21037 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.1352683735406703,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 119000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 21012 |
"eval_steps_per_second": 15.111,
|
| 21013 |
"num_input_tokens_seen": 61856020192,
|
| 21014 |
"step": 118000
|
| 21015 |
+
},
|
| 21016 |
+
{
|
| 21017 |
+
"epoch": 1.1262053257330933,
|
| 21018 |
+
"grad_norm": 0.15097704529762268,
|
| 21019 |
+
"learning_rate": 0.0008891607141781631,
|
| 21020 |
+
"loss": 2.0857,
|
| 21021 |
+
"num_input_tokens_seen": 61882234592,
|
| 21022 |
+
"step": 118050
|
| 21023 |
+
},
|
| 21024 |
+
{
|
| 21025 |
+
"epoch": 1.1266823282492815,
|
| 21026 |
+
"grad_norm": 0.1383848935365677,
|
| 21027 |
+
"learning_rate": 0.0008873934395068005,
|
| 21028 |
+
"loss": 2.0858,
|
| 21029 |
+
"num_input_tokens_seen": 61908441120,
|
| 21030 |
+
"step": 118100
|
| 21031 |
+
},
|
| 21032 |
+
{
|
| 21033 |
+
"epoch": 1.1271593307654697,
|
| 21034 |
+
"grad_norm": 0.14688968658447266,
|
| 21035 |
+
"learning_rate": 0.0008856139728393666,
|
| 21036 |
+
"loss": 2.085,
|
| 21037 |
+
"num_input_tokens_seen": 61934653984,
|
| 21038 |
+
"step": 118150
|
| 21039 |
+
},
|
| 21040 |
+
{
|
| 21041 |
+
"epoch": 1.1276363332816581,
|
| 21042 |
+
"grad_norm": 0.14446312189102173,
|
| 21043 |
+
"learning_rate": 0.0008838223701790055,
|
| 21044 |
+
"loss": 2.0765,
|
| 21045 |
+
"num_input_tokens_seen": 61960867808,
|
| 21046 |
+
"step": 118200
|
| 21047 |
+
},
|
| 21048 |
+
{
|
| 21049 |
+
"epoch": 1.1281133357978463,
|
| 21050 |
+
"grad_norm": 0.1389646828174591,
|
| 21051 |
+
"learning_rate": 0.0008820186879108038,
|
| 21052 |
+
"loss": 2.0816,
|
| 21053 |
+
"num_input_tokens_seen": 61987070336,
|
| 21054 |
+
"step": 118250
|
| 21055 |
+
},
|
| 21056 |
+
{
|
| 21057 |
+
"epoch": 1.1285903383140345,
|
| 21058 |
+
"grad_norm": 0.14348453283309937,
|
| 21059 |
+
"learning_rate": 0.0008802029828000156,
|
| 21060 |
+
"loss": 2.0875,
|
| 21061 |
+
"num_input_tokens_seen": 62013276640,
|
| 21062 |
+
"step": 118300
|
| 21063 |
+
},
|
| 21064 |
+
{
|
| 21065 |
+
"epoch": 1.129067340830223,
|
| 21066 |
+
"grad_norm": 0.14246419072151184,
|
| 21067 |
+
"learning_rate": 0.0008783753119902765,
|
| 21068 |
+
"loss": 2.0828,
|
| 21069 |
+
"num_input_tokens_seen": 62039490144,
|
| 21070 |
+
"step": 118350
|
| 21071 |
+
},
|
| 21072 |
+
{
|
| 21073 |
+
"epoch": 1.1295443433464112,
|
| 21074 |
+
"grad_norm": 0.13848936557769775,
|
| 21075 |
+
"learning_rate": 0.0008765357330018055,
|
| 21076 |
+
"loss": 2.0895,
|
| 21077 |
+
"num_input_tokens_seen": 62065704544,
|
| 21078 |
+
"step": 118400
|
| 21079 |
+
},
|
| 21080 |
+
{
|
| 21081 |
+
"epoch": 1.1300213458625994,
|
| 21082 |
+
"grad_norm": 0.14894653856754303,
|
| 21083 |
+
"learning_rate": 0.0008746843037295936,
|
| 21084 |
+
"loss": 2.079,
|
| 21085 |
+
"num_input_tokens_seen": 62091916704,
|
| 21086 |
+
"step": 118450
|
| 21087 |
+
},
|
| 21088 |
+
{
|
| 21089 |
+
"epoch": 1.1304983483787878,
|
| 21090 |
+
"grad_norm": 0.1354195922613144,
|
| 21091 |
+
"learning_rate": 0.0008728210824415827,
|
| 21092 |
+
"loss": 2.0836,
|
| 21093 |
+
"num_input_tokens_seen": 62118128864,
|
| 21094 |
+
"step": 118500
|
| 21095 |
+
},
|
| 21096 |
+
{
|
| 21097 |
+
"epoch": 1.1304983483787878,
|
| 21098 |
+
"eval_loss": 2.004451274871826,
|
| 21099 |
+
"eval_runtime": 82.4857,
|
| 21100 |
+
"eval_samples_per_second": 60.617,
|
| 21101 |
+
"eval_steps_per_second": 15.154,
|
| 21102 |
+
"num_input_tokens_seen": 62118128864,
|
| 21103 |
+
"step": 118500
|
| 21104 |
+
},
|
| 21105 |
+
{
|
| 21106 |
+
"epoch": 1.130975350894976,
|
| 21107 |
+
"grad_norm": 0.14576098322868347,
|
| 21108 |
+
"learning_rate": 0.0008709461277768318,
|
| 21109 |
+
"loss": 2.0912,
|
| 21110 |
+
"num_input_tokens_seen": 62144343264,
|
| 21111 |
+
"step": 118550
|
| 21112 |
+
},
|
| 21113 |
+
{
|
| 21114 |
+
"epoch": 1.1314523534111642,
|
| 21115 |
+
"grad_norm": 0.14351360499858856,
|
| 21116 |
+
"learning_rate": 0.0008690594987436704,
|
| 21117 |
+
"loss": 2.0777,
|
| 21118 |
+
"num_input_tokens_seen": 62170554112,
|
| 21119 |
+
"step": 118600
|
| 21120 |
+
},
|
| 21121 |
+
{
|
| 21122 |
+
"epoch": 1.1319293559273524,
|
| 21123 |
+
"grad_norm": 0.14756879210472107,
|
| 21124 |
+
"learning_rate": 0.0008671612547178428,
|
| 21125 |
+
"loss": 2.0907,
|
| 21126 |
+
"num_input_tokens_seen": 62196764384,
|
| 21127 |
+
"step": 118650
|
| 21128 |
+
},
|
| 21129 |
+
{
|
| 21130 |
+
"epoch": 1.1324063584435409,
|
| 21131 |
+
"grad_norm": 0.15026496350765228,
|
| 21132 |
+
"learning_rate": 0.0008652514554406388,
|
| 21133 |
+
"loss": 2.0857,
|
| 21134 |
+
"num_input_tokens_seen": 62222966592,
|
| 21135 |
+
"step": 118700
|
| 21136 |
+
},
|
| 21137 |
+
{
|
| 21138 |
+
"epoch": 1.132883360959729,
|
| 21139 |
+
"grad_norm": 0.13817134499549866,
|
| 21140 |
+
"learning_rate": 0.0008633301610170136,
|
| 21141 |
+
"loss": 2.0851,
|
| 21142 |
+
"num_input_tokens_seen": 62249176192,
|
| 21143 |
+
"step": 118750
|
| 21144 |
+
},
|
| 21145 |
+
{
|
| 21146 |
+
"epoch": 1.1333603634759173,
|
| 21147 |
+
"grad_norm": 0.13346219062805176,
|
| 21148 |
+
"learning_rate": 0.0008613974319136957,
|
| 21149 |
+
"loss": 2.0856,
|
| 21150 |
+
"num_input_tokens_seen": 62275388064,
|
| 21151 |
+
"step": 118800
|
| 21152 |
+
},
|
| 21153 |
+
{
|
| 21154 |
+
"epoch": 1.1338373659921057,
|
| 21155 |
+
"grad_norm": 0.14300605654716492,
|
| 21156 |
+
"learning_rate": 0.0008594533289572853,
|
| 21157 |
+
"loss": 2.0835,
|
| 21158 |
+
"num_input_tokens_seen": 62301602464,
|
| 21159 |
+
"step": 118850
|
| 21160 |
+
},
|
| 21161 |
+
{
|
| 21162 |
+
"epoch": 1.134314368508294,
|
| 21163 |
+
"grad_norm": 0.13790345191955566,
|
| 21164 |
+
"learning_rate": 0.0008574979133323377,
|
| 21165 |
+
"loss": 2.0811,
|
| 21166 |
+
"num_input_tokens_seen": 62327812128,
|
| 21167 |
+
"step": 118900
|
| 21168 |
+
},
|
| 21169 |
+
{
|
| 21170 |
+
"epoch": 1.1347913710244821,
|
| 21171 |
+
"grad_norm": 0.1419474184513092,
|
| 21172 |
+
"learning_rate": 0.0008555312465794402,
|
| 21173 |
+
"loss": 2.0783,
|
| 21174 |
+
"num_input_tokens_seen": 62354024288,
|
| 21175 |
+
"step": 118950
|
| 21176 |
+
},
|
| 21177 |
+
{
|
| 21178 |
+
"epoch": 1.1352683735406703,
|
| 21179 |
+
"grad_norm": 0.15154699981212616,
|
| 21180 |
+
"learning_rate": 0.0008535533905932737,
|
| 21181 |
+
"loss": 2.0858,
|
| 21182 |
+
"num_input_tokens_seen": 62380238112,
|
| 21183 |
+
"step": 119000
|
| 21184 |
+
},
|
| 21185 |
+
{
|
| 21186 |
+
"epoch": 1.1352683735406703,
|
| 21187 |
+
"eval_loss": 2.0006425380706787,
|
| 21188 |
+
"eval_runtime": 82.1764,
|
| 21189 |
+
"eval_samples_per_second": 60.845,
|
| 21190 |
+
"eval_steps_per_second": 15.211,
|
| 21191 |
+
"num_input_tokens_seen": 62380238112,
|
| 21192 |
+
"step": 119000
|
| 21193 |
}
|
| 21194 |
],
|
| 21195 |
"logging_steps": 50,
|
| 21196 |
"max_steps": 140000,
|
| 21197 |
+
"num_input_tokens_seen": 62380238112,
|
| 21198 |
"num_train_epochs": 2,
|
| 21199 |
"save_steps": 1000,
|
| 21200 |
"stateful_callbacks": {
|
|
|
|
| 21209 |
"attributes": {}
|
| 21210 |
}
|
| 21211 |
},
|
| 21212 |
+
"total_flos": 1.1040164330280837e+20,
|
| 21213 |
"train_batch_size": 32,
|
| 21214 |
"trial_name": null,
|
| 21215 |
"trial_params": null
|