Training in progress, step 63000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a92901e6dc98a2f43e5ab06e2e35886c7f4c68e401e8be0d01acd281cd82349c
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34104db96694bb116cf3048bcf68919612a7c6c79ff646c13c6e8d5a81aff8f6
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a97095234a7b82e99cd1b23ba4db26c35942b8b4622876b166d0ce65b7c7110
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d1de5c681ac3c8b6bb5235a71c5b6efd72fc9171aa2c9c6e093b8695c8a08b8
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11044,11 +11044,189 @@
|
|
| 11044 |
"eval_steps_per_second": 23.306,
|
| 11045 |
"num_input_tokens_seen": 16252923456,
|
| 11046 |
"step": 62000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11047 |
}
|
| 11048 |
],
|
| 11049 |
"logging_steps": 50,
|
| 11050 |
"max_steps": 70000,
|
| 11051 |
-
"num_input_tokens_seen":
|
| 11052 |
"num_train_epochs": 1,
|
| 11053 |
"save_steps": 1000,
|
| 11054 |
"stateful_callbacks": {
|
|
@@ -11063,7 +11241,7 @@
|
|
| 11063 |
"attributes": {}
|
| 11064 |
}
|
| 11065 |
},
|
| 11066 |
-
"total_flos": 4.
|
| 11067 |
"train_batch_size": 64,
|
| 11068 |
"trial_name": null,
|
| 11069 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.30051158519861193,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 63000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11044 |
"eval_steps_per_second": 23.306,
|
| 11045 |
"num_input_tokens_seen": 16252923456,
|
| 11046 |
"step": 62000
|
| 11047 |
+
},
|
| 11048 |
+
{
|
| 11049 |
+
"epoch": 0.29598006129482335,
|
| 11050 |
+
"grad_norm": 0.1945638656616211,
|
| 11051 |
+
"learning_rate": 0.0006057842458386314,
|
| 11052 |
+
"loss": 2.5582,
|
| 11053 |
+
"num_input_tokens_seen": 16266030656,
|
| 11054 |
+
"step": 62050
|
| 11055 |
+
},
|
| 11056 |
+
{
|
| 11057 |
+
"epoch": 0.29621856255291745,
|
| 11058 |
+
"grad_norm": 0.201882466673851,
|
| 11059 |
+
"learning_rate": 0.0006002947078916364,
|
| 11060 |
+
"loss": 2.5764,
|
| 11061 |
+
"num_input_tokens_seen": 16279137856,
|
| 11062 |
+
"step": 62100
|
| 11063 |
+
},
|
| 11064 |
+
{
|
| 11065 |
+
"epoch": 0.2964570638110116,
|
| 11066 |
+
"grad_norm": 0.2137998789548874,
|
| 11067 |
+
"learning_rate": 0.0005947925441958392,
|
| 11068 |
+
"loss": 2.5689,
|
| 11069 |
+
"num_input_tokens_seen": 16292245056,
|
| 11070 |
+
"step": 62150
|
| 11071 |
+
},
|
| 11072 |
+
{
|
| 11073 |
+
"epoch": 0.2966955650691057,
|
| 11074 |
+
"grad_norm": 0.18265672028064728,
|
| 11075 |
+
"learning_rate": 0.0005892784473993184,
|
| 11076 |
+
"loss": 2.5741,
|
| 11077 |
+
"num_input_tokens_seen": 16305352256,
|
| 11078 |
+
"step": 62200
|
| 11079 |
+
},
|
| 11080 |
+
{
|
| 11081 |
+
"epoch": 0.2969340663271999,
|
| 11082 |
+
"grad_norm": 0.16944251954555511,
|
| 11083 |
+
"learning_rate": 0.0005837531116523682,
|
| 11084 |
+
"loss": 2.5537,
|
| 11085 |
+
"num_input_tokens_seen": 16318459456,
|
| 11086 |
+
"step": 62250
|
| 11087 |
+
},
|
| 11088 |
+
{
|
| 11089 |
+
"epoch": 0.29717256758529403,
|
| 11090 |
+
"grad_norm": 0.20273485779762268,
|
| 11091 |
+
"learning_rate": 0.0005782172325201155,
|
| 11092 |
+
"loss": 2.5512,
|
| 11093 |
+
"num_input_tokens_seen": 16331566656,
|
| 11094 |
+
"step": 62300
|
| 11095 |
+
},
|
| 11096 |
+
{
|
| 11097 |
+
"epoch": 0.29741106884338814,
|
| 11098 |
+
"grad_norm": 0.19320476055145264,
|
| 11099 |
+
"learning_rate": 0.0005726715068949564,
|
| 11100 |
+
"loss": 2.5823,
|
| 11101 |
+
"num_input_tokens_seen": 16344673856,
|
| 11102 |
+
"step": 62350
|
| 11103 |
+
},
|
| 11104 |
+
{
|
| 11105 |
+
"epoch": 0.2976495701014823,
|
| 11106 |
+
"grad_norm": 0.21321871876716614,
|
| 11107 |
+
"learning_rate": 0.0005671166329088278,
|
| 11108 |
+
"loss": 2.5608,
|
| 11109 |
+
"num_input_tokens_seen": 16357781056,
|
| 11110 |
+
"step": 62400
|
| 11111 |
+
},
|
| 11112 |
+
{
|
| 11113 |
+
"epoch": 0.2978880713595764,
|
| 11114 |
+
"grad_norm": 0.2007117122411728,
|
| 11115 |
+
"learning_rate": 0.0005615533098453215,
|
| 11116 |
+
"loss": 2.5685,
|
| 11117 |
+
"num_input_tokens_seen": 16370888256,
|
| 11118 |
+
"step": 62450
|
| 11119 |
+
},
|
| 11120 |
+
{
|
| 11121 |
+
"epoch": 0.29812657261767056,
|
| 11122 |
+
"grad_norm": 0.1896267682313919,
|
| 11123 |
+
"learning_rate": 0.0005559822380516539,
|
| 11124 |
+
"loss": 2.56,
|
| 11125 |
+
"num_input_tokens_seen": 16383995456,
|
| 11126 |
+
"step": 62500
|
| 11127 |
+
},
|
| 11128 |
+
{
|
| 11129 |
+
"epoch": 0.29812657261767056,
|
| 11130 |
+
"eval_loss": 2.448042154312134,
|
| 11131 |
+
"eval_runtime": 54.1994,
|
| 11132 |
+
"eval_samples_per_second": 92.252,
|
| 11133 |
+
"eval_steps_per_second": 23.063,
|
| 11134 |
+
"num_input_tokens_seen": 16383995456,
|
| 11135 |
+
"step": 62500
|
| 11136 |
+
},
|
| 11137 |
+
{
|
| 11138 |
+
"epoch": 0.2983650738757647,
|
| 11139 |
+
"grad_norm": 0.18581034243106842,
|
| 11140 |
+
"learning_rate": 0.0005504041188505022,
|
| 11141 |
+
"loss": 2.5691,
|
| 11142 |
+
"num_input_tokens_seen": 16397102656,
|
| 11143 |
+
"step": 62550
|
| 11144 |
+
},
|
| 11145 |
+
{
|
| 11146 |
+
"epoch": 0.2986035751338588,
|
| 11147 |
+
"grad_norm": 0.19272533059120178,
|
| 11148 |
+
"learning_rate": 0.0005448196544517168,
|
| 11149 |
+
"loss": 2.5635,
|
| 11150 |
+
"num_input_tokens_seen": 16410209856,
|
| 11151 |
+
"step": 62600
|
| 11152 |
+
},
|
| 11153 |
+
{
|
| 11154 |
+
"epoch": 0.298842076391953,
|
| 11155 |
+
"grad_norm": 0.19940300285816193,
|
| 11156 |
+
"learning_rate": 0.0005392295478639225,
|
| 11157 |
+
"loss": 2.5755,
|
| 11158 |
+
"num_input_tokens_seen": 16423317056,
|
| 11159 |
+
"step": 62650
|
| 11160 |
+
},
|
| 11161 |
+
{
|
| 11162 |
+
"epoch": 0.2990805776500471,
|
| 11163 |
+
"grad_norm": 0.18894875049591064,
|
| 11164 |
+
"learning_rate": 0.0005336345028060199,
|
| 11165 |
+
"loss": 2.5718,
|
| 11166 |
+
"num_input_tokens_seen": 16436424256,
|
| 11167 |
+
"step": 62700
|
| 11168 |
+
},
|
| 11169 |
+
{
|
| 11170 |
+
"epoch": 0.29931907890814125,
|
| 11171 |
+
"grad_norm": 0.19226962327957153,
|
| 11172 |
+
"learning_rate": 0.0005280352236185959,
|
| 11173 |
+
"loss": 2.563,
|
| 11174 |
+
"num_input_tokens_seen": 16449531456,
|
| 11175 |
+
"step": 62750
|
| 11176 |
+
},
|
| 11177 |
+
{
|
| 11178 |
+
"epoch": 0.2995575801662354,
|
| 11179 |
+
"grad_norm": 0.20716702938079834,
|
| 11180 |
+
"learning_rate": 0.0005224324151752575,
|
| 11181 |
+
"loss": 2.5532,
|
| 11182 |
+
"num_input_tokens_seen": 16462638656,
|
| 11183 |
+
"step": 62800
|
| 11184 |
+
},
|
| 11185 |
+
{
|
| 11186 |
+
"epoch": 0.2997960814243295,
|
| 11187 |
+
"grad_norm": 0.20232325792312622,
|
| 11188 |
+
"learning_rate": 0.000516826782793897,
|
| 11189 |
+
"loss": 2.5691,
|
| 11190 |
+
"num_input_tokens_seen": 16475745856,
|
| 11191 |
+
"step": 62850
|
| 11192 |
+
},
|
| 11193 |
+
{
|
| 11194 |
+
"epoch": 0.30003458268242367,
|
| 11195 |
+
"grad_norm": 0.19828926026821136,
|
| 11196 |
+
"learning_rate": 0.0005112190321479025,
|
| 11197 |
+
"loss": 2.5602,
|
| 11198 |
+
"num_input_tokens_seen": 16488853056,
|
| 11199 |
+
"step": 62900
|
| 11200 |
+
},
|
| 11201 |
+
{
|
| 11202 |
+
"epoch": 0.30027308394051777,
|
| 11203 |
+
"grad_norm": 0.22366905212402344,
|
| 11204 |
+
"learning_rate": 0.000505609869177323,
|
| 11205 |
+
"loss": 2.5556,
|
| 11206 |
+
"num_input_tokens_seen": 16501960256,
|
| 11207 |
+
"step": 62950
|
| 11208 |
+
},
|
| 11209 |
+
{
|
| 11210 |
+
"epoch": 0.30051158519861193,
|
| 11211 |
+
"grad_norm": 0.1883884221315384,
|
| 11212 |
+
"learning_rate": 0.0005,
|
| 11213 |
+
"loss": 2.5567,
|
| 11214 |
+
"num_input_tokens_seen": 16515067456,
|
| 11215 |
+
"step": 63000
|
| 11216 |
+
},
|
| 11217 |
+
{
|
| 11218 |
+
"epoch": 0.30051158519861193,
|
| 11219 |
+
"eval_loss": 2.4441678524017334,
|
| 11220 |
+
"eval_runtime": 54.2448,
|
| 11221 |
+
"eval_samples_per_second": 92.175,
|
| 11222 |
+
"eval_steps_per_second": 23.044,
|
| 11223 |
+
"num_input_tokens_seen": 16515067456,
|
| 11224 |
+
"step": 63000
|
| 11225 |
}
|
| 11226 |
],
|
| 11227 |
"logging_steps": 50,
|
| 11228 |
"max_steps": 70000,
|
| 11229 |
+
"num_input_tokens_seen": 16515067456,
|
| 11230 |
"num_train_epochs": 1,
|
| 11231 |
"save_steps": 1000,
|
| 11232 |
"stateful_callbacks": {
|
|
|
|
| 11241 |
"attributes": {}
|
| 11242 |
}
|
| 11243 |
},
|
| 11244 |
+
"total_flos": 4.4179417315383706e+18,
|
| 11245 |
"train_batch_size": 64,
|
| 11246 |
"trial_name": null,
|
| 11247 |
"trial_params": null
|