Training in progress, step 63000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0f52abb8596fb1c55e5609ec97ec3ea8479c701d5763f12612f03207baebfdc
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc42234c3f4bb7923a06f1e41810d1e801108c51e07feed1ea66a8af7c05bc5a
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a97095234a7b82e99cd1b23ba4db26c35942b8b4622876b166d0ce65b7c7110
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ca0e8dbf69c9810c713183e067be8112924d576870302a9fb3c526f389826e7
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11044,11 +11044,189 @@
|
|
| 11044 |
"eval_steps_per_second": 23.717,
|
| 11045 |
"num_input_tokens_seen": 16252928000,
|
| 11046 |
"step": 62000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11047 |
}
|
| 11048 |
],
|
| 11049 |
"logging_steps": 50,
|
| 11050 |
"max_steps": 70000,
|
| 11051 |
-
"num_input_tokens_seen":
|
| 11052 |
"num_train_epochs": 1,
|
| 11053 |
"save_steps": 1000,
|
| 11054 |
"stateful_callbacks": {
|
|
@@ -11063,7 +11241,7 @@
|
|
| 11063 |
"attributes": {}
|
| 11064 |
}
|
| 11065 |
},
|
| 11066 |
-
"total_flos": 4.
|
| 11067 |
"train_batch_size": 64,
|
| 11068 |
"trial_name": null,
|
| 11069 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.4237694290715918,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 63000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11044 |
"eval_steps_per_second": 23.717,
|
| 11045 |
"num_input_tokens_seen": 16252928000,
|
| 11046 |
"step": 62000
|
| 11047 |
+
},
|
| 11048 |
+
{
|
| 11049 |
+
"epoch": 0.41737925514114715,
|
| 11050 |
+
"grad_norm": 0.19505389034748077,
|
| 11051 |
+
"learning_rate": 0.0004950913403314252,
|
| 11052 |
+
"loss": 2.9995,
|
| 11053 |
+
"num_input_tokens_seen": 16266035200,
|
| 11054 |
+
"step": 62050
|
| 11055 |
+
},
|
| 11056 |
+
{
|
| 11057 |
+
"epoch": 0.41771558008485477,
|
| 11058 |
+
"grad_norm": 0.18988089263439178,
|
| 11059 |
+
"learning_rate": 0.0004901831537696859,
|
| 11060 |
+
"loss": 3.0041,
|
| 11061 |
+
"num_input_tokens_seen": 16279142400,
|
| 11062 |
+
"step": 62100
|
| 11063 |
+
},
|
| 11064 |
+
{
|
| 11065 |
+
"epoch": 0.4180519050285624,
|
| 11066 |
+
"grad_norm": 0.19544407725334167,
|
| 11067 |
+
"learning_rate": 0.0004852759133760184,
|
| 11068 |
+
"loss": 3.0073,
|
| 11069 |
+
"num_input_tokens_seen": 16292249600,
|
| 11070 |
+
"step": 62150
|
| 11071 |
+
},
|
| 11072 |
+
{
|
| 11073 |
+
"epoch": 0.41838822997227,
|
| 11074 |
+
"grad_norm": 0.1884351521730423,
|
| 11075 |
+
"learning_rate": 0.00048037009212046586,
|
| 11076 |
+
"loss": 3.0035,
|
| 11077 |
+
"num_input_tokens_seen": 16305356800,
|
| 11078 |
+
"step": 62200
|
| 11079 |
+
},
|
| 11080 |
+
{
|
| 11081 |
+
"epoch": 0.4187245549159776,
|
| 11082 |
+
"grad_norm": 0.17927390336990356,
|
| 11083 |
+
"learning_rate": 0.000475466162836291,
|
| 11084 |
+
"loss": 2.9921,
|
| 11085 |
+
"num_input_tokens_seen": 16318464000,
|
| 11086 |
+
"step": 62250
|
| 11087 |
+
},
|
| 11088 |
+
{
|
| 11089 |
+
"epoch": 0.4190608798596852,
|
| 11090 |
+
"grad_norm": 0.18687283992767334,
|
| 11091 |
+
"learning_rate": 0.00047056459817440544,
|
| 11092 |
+
"loss": 3.0042,
|
| 11093 |
+
"num_input_tokens_seen": 16331571200,
|
| 11094 |
+
"step": 62300
|
| 11095 |
+
},
|
| 11096 |
+
{
|
| 11097 |
+
"epoch": 0.4193972048033928,
|
| 11098 |
+
"grad_norm": 0.18783149123191833,
|
| 11099 |
+
"learning_rate": 0.00046566587055781316,
|
| 11100 |
+
"loss": 3.0003,
|
| 11101 |
+
"num_input_tokens_seen": 16344678400,
|
| 11102 |
+
"step": 62350
|
| 11103 |
+
},
|
| 11104 |
+
{
|
| 11105 |
+
"epoch": 0.41973352974710043,
|
| 11106 |
+
"grad_norm": 0.18625770509243011,
|
| 11107 |
+
"learning_rate": 0.0004607704521360776,
|
| 11108 |
+
"loss": 3.0061,
|
| 11109 |
+
"num_input_tokens_seen": 16357785600,
|
| 11110 |
+
"step": 62400
|
| 11111 |
+
},
|
| 11112 |
+
{
|
| 11113 |
+
"epoch": 0.4200698546908081,
|
| 11114 |
+
"grad_norm": 0.20189669728279114,
|
| 11115 |
+
"learning_rate": 0.00045587881473981533,
|
| 11116 |
+
"loss": 2.9976,
|
| 11117 |
+
"num_input_tokens_seen": 16370892800,
|
| 11118 |
+
"step": 62450
|
| 11119 |
+
},
|
| 11120 |
+
{
|
| 11121 |
+
"epoch": 0.4204061796345157,
|
| 11122 |
+
"grad_norm": 0.19049198925495148,
|
| 11123 |
+
"learning_rate": 0.0004509914298352197,
|
| 11124 |
+
"loss": 3.0055,
|
| 11125 |
+
"num_input_tokens_seen": 16384000000,
|
| 11126 |
+
"step": 62500
|
| 11127 |
+
},
|
| 11128 |
+
{
|
| 11129 |
+
"epoch": 0.4204061796345157,
|
| 11130 |
+
"eval_loss": 2.896798849105835,
|
| 11131 |
+
"eval_runtime": 52.8908,
|
| 11132 |
+
"eval_samples_per_second": 94.534,
|
| 11133 |
+
"eval_steps_per_second": 23.634,
|
| 11134 |
+
"num_input_tokens_seen": 16384000000,
|
| 11135 |
+
"step": 62500
|
| 11136 |
+
},
|
| 11137 |
+
{
|
| 11138 |
+
"epoch": 0.4207425045782233,
|
| 11139 |
+
"grad_norm": 0.1667575091123581,
|
| 11140 |
+
"learning_rate": 0.00044610876847862033,
|
| 11141 |
+
"loss": 2.9929,
|
| 11142 |
+
"num_input_tokens_seen": 16397107200,
|
| 11143 |
+
"step": 62550
|
| 11144 |
+
},
|
| 11145 |
+
{
|
| 11146 |
+
"epoch": 0.42107882952193093,
|
| 11147 |
+
"grad_norm": 0.7176526188850403,
|
| 11148 |
+
"learning_rate": 0.00044123130127108126,
|
| 11149 |
+
"loss": 2.9918,
|
| 11150 |
+
"num_input_tokens_seen": 16410214400,
|
| 11151 |
+
"step": 62600
|
| 11152 |
+
},
|
| 11153 |
+
{
|
| 11154 |
+
"epoch": 0.42141515446563854,
|
| 11155 |
+
"grad_norm": 0.20578069984912872,
|
| 11156 |
+
"learning_rate": 0.00043635949831304343,
|
| 11157 |
+
"loss": 3.0037,
|
| 11158 |
+
"num_input_tokens_seen": 16423321600,
|
| 11159 |
+
"step": 62650
|
| 11160 |
+
},
|
| 11161 |
+
{
|
| 11162 |
+
"epoch": 0.42175147940934615,
|
| 11163 |
+
"grad_norm": 0.19712655246257782,
|
| 11164 |
+
"learning_rate": 0.0004314938291590161,
|
| 11165 |
+
"loss": 3.0142,
|
| 11166 |
+
"num_input_tokens_seen": 16436428800,
|
| 11167 |
+
"step": 62700
|
| 11168 |
+
},
|
| 11169 |
+
{
|
| 11170 |
+
"epoch": 0.42208780435305376,
|
| 11171 |
+
"grad_norm": 0.20189446210861206,
|
| 11172 |
+
"learning_rate": 0.00042663476277231917,
|
| 11173 |
+
"loss": 2.9983,
|
| 11174 |
+
"num_input_tokens_seen": 16449536000,
|
| 11175 |
+
"step": 62750
|
| 11176 |
+
},
|
| 11177 |
+
{
|
| 11178 |
+
"epoch": 0.4224241292967614,
|
| 11179 |
+
"grad_norm": 0.18463867902755737,
|
| 11180 |
+
"learning_rate": 0.0004217827674798845,
|
| 11181 |
+
"loss": 2.9971,
|
| 11182 |
+
"num_input_tokens_seen": 16462643200,
|
| 11183 |
+
"step": 62800
|
| 11184 |
+
},
|
| 11185 |
+
{
|
| 11186 |
+
"epoch": 0.422760454240469,
|
| 11187 |
+
"grad_norm": 0.17639389634132385,
|
| 11188 |
+
"learning_rate": 0.0004169383109271174,
|
| 11189 |
+
"loss": 3.0032,
|
| 11190 |
+
"num_input_tokens_seen": 16475750400,
|
| 11191 |
+
"step": 62850
|
| 11192 |
+
},
|
| 11193 |
+
{
|
| 11194 |
+
"epoch": 0.4230967791841766,
|
| 11195 |
+
"grad_norm": 0.1733781099319458,
|
| 11196 |
+
"learning_rate": 0.00041210186003282274,
|
| 11197 |
+
"loss": 2.9932,
|
| 11198 |
+
"num_input_tokens_seen": 16488857600,
|
| 11199 |
+
"step": 62900
|
| 11200 |
+
},
|
| 11201 |
+
{
|
| 11202 |
+
"epoch": 0.4234331041278842,
|
| 11203 |
+
"grad_norm": 0.17753124237060547,
|
| 11204 |
+
"learning_rate": 0.00040727388094420456,
|
| 11205 |
+
"loss": 3.0012,
|
| 11206 |
+
"num_input_tokens_seen": 16501964800,
|
| 11207 |
+
"step": 62950
|
| 11208 |
+
},
|
| 11209 |
+
{
|
| 11210 |
+
"epoch": 0.4237694290715918,
|
| 11211 |
+
"grad_norm": 0.180925652384758,
|
| 11212 |
+
"learning_rate": 0.00040245483899193594,
|
| 11213 |
+
"loss": 2.9823,
|
| 11214 |
+
"num_input_tokens_seen": 16515072000,
|
| 11215 |
+
"step": 63000
|
| 11216 |
+
},
|
| 11217 |
+
{
|
| 11218 |
+
"epoch": 0.4237694290715918,
|
| 11219 |
+
"eval_loss": 2.8929545879364014,
|
| 11220 |
+
"eval_runtime": 53.37,
|
| 11221 |
+
"eval_samples_per_second": 93.686,
|
| 11222 |
+
"eval_steps_per_second": 23.421,
|
| 11223 |
+
"num_input_tokens_seen": 16515072000,
|
| 11224 |
+
"step": 63000
|
| 11225 |
}
|
| 11226 |
],
|
| 11227 |
"logging_steps": 50,
|
| 11228 |
"max_steps": 70000,
|
| 11229 |
+
"num_input_tokens_seen": 16515072000,
|
| 11230 |
"num_train_epochs": 1,
|
| 11231 |
"save_steps": 1000,
|
| 11232 |
"stateful_callbacks": {
|
|
|
|
| 11241 |
"attributes": {}
|
| 11242 |
}
|
| 11243 |
},
|
| 11244 |
+
"total_flos": 4.41794294710272e+18,
|
| 11245 |
"train_batch_size": 64,
|
| 11246 |
"trial_name": null,
|
| 11247 |
"trial_params": null
|