Training in progress, step 136000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2fe644242ac85364957a221ecb3fda251252bbb21f78dcf32d44ddb45cee4b8c
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9a0bb2637b2d27c703e80119c30822f6cacfac9cba885cfe1635772ce684b387
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5c2ffcf5f582912b4a7016b15e29048dddaa402730efcd133059a2e08945301c
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff50fa4a38896a05eab7dc1bfd456c8019098d112a942a25a411381c6596e51c
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -24038,11 +24038,189 @@
|
|
| 24038 |
"eval_steps_per_second": 15.14,
|
| 24039 |
"num_input_tokens_seen": 70767457344,
|
| 24040 |
"step": 135000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24041 |
}
|
| 24042 |
],
|
| 24043 |
"logging_steps": 50,
|
| 24044 |
"max_steps": 140000,
|
| 24045 |
-
"num_input_tokens_seen":
|
| 24046 |
"num_train_epochs": 2,
|
| 24047 |
"save_steps": 1000,
|
| 24048 |
"stateful_callbacks": {
|
|
@@ -24057,7 +24235,7 @@
|
|
| 24057 |
"attributes": {}
|
| 24058 |
}
|
| 24059 |
},
|
| 24060 |
-
"total_flos": 1.
|
| 24061 |
"train_batch_size": 32,
|
| 24062 |
"trial_name": null,
|
| 24063 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.297449229044683,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 136000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 24038 |
"eval_steps_per_second": 15.14,
|
| 24039 |
"num_input_tokens_seen": 70767457344,
|
| 24040 |
"step": 135000
|
| 24041 |
+
},
|
| 24042 |
+
{
|
| 24043 |
+
"epoch": 1.288386181237106,
|
| 24044 |
+
"grad_norm": 0.1237749382853508,
|
| 24045 |
+
"learning_rate": 7.515222372735647e-05,
|
| 24046 |
+
"loss": 2.029,
|
| 24047 |
+
"num_input_tokens_seen": 70793671744,
|
| 24048 |
+
"step": 135050
|
| 24049 |
+
},
|
| 24050 |
+
{
|
| 24051 |
+
"epoch": 1.2888631837532942,
|
| 24052 |
+
"grad_norm": 0.11638092249631882,
|
| 24053 |
+
"learning_rate": 7.367991782295391e-05,
|
| 24054 |
+
"loss": 2.0171,
|
| 24055 |
+
"num_input_tokens_seen": 70819879168,
|
| 24056 |
+
"step": 135100
|
| 24057 |
+
},
|
| 24058 |
+
{
|
| 24059 |
+
"epoch": 1.2893401862694827,
|
| 24060 |
+
"grad_norm": 0.11938998103141785,
|
| 24061 |
+
"learning_rate": 7.222102900887101e-05,
|
| 24062 |
+
"loss": 2.0232,
|
| 24063 |
+
"num_input_tokens_seen": 70846079616,
|
| 24064 |
+
"step": 135150
|
| 24065 |
+
},
|
| 24066 |
+
{
|
| 24067 |
+
"epoch": 1.2898171887856709,
|
| 24068 |
+
"grad_norm": 0.11985292285680771,
|
| 24069 |
+
"learning_rate": 7.077560319906695e-05,
|
| 24070 |
+
"loss": 2.0387,
|
| 24071 |
+
"num_input_tokens_seen": 70872294016,
|
| 24072 |
+
"step": 135200
|
| 24073 |
+
},
|
| 24074 |
+
{
|
| 24075 |
+
"epoch": 1.290294191301859,
|
| 24076 |
+
"grad_norm": 0.12651756405830383,
|
| 24077 |
+
"learning_rate": 6.934368588379552e-05,
|
| 24078 |
+
"loss": 2.0345,
|
| 24079 |
+
"num_input_tokens_seen": 70898498624,
|
| 24080 |
+
"step": 135250
|
| 24081 |
+
},
|
| 24082 |
+
{
|
| 24083 |
+
"epoch": 1.2907711938180473,
|
| 24084 |
+
"grad_norm": 0.12012086063623428,
|
| 24085 |
+
"learning_rate": 6.792532212817271e-05,
|
| 24086 |
+
"loss": 2.0362,
|
| 24087 |
+
"num_input_tokens_seen": 70924710048,
|
| 24088 |
+
"step": 135300
|
| 24089 |
+
},
|
| 24090 |
+
{
|
| 24091 |
+
"epoch": 1.2912481963342357,
|
| 24092 |
+
"grad_norm": 0.12295469641685486,
|
| 24093 |
+
"learning_rate": 6.652055657075845e-05,
|
| 24094 |
+
"loss": 2.0338,
|
| 24095 |
+
"num_input_tokens_seen": 70950915200,
|
| 24096 |
+
"step": 135350
|
| 24097 |
+
},
|
| 24098 |
+
{
|
| 24099 |
+
"epoch": 1.291725198850424,
|
| 24100 |
+
"grad_norm": 0.12192966043949127,
|
| 24101 |
+
"learning_rate": 6.512943342215233e-05,
|
| 24102 |
+
"loss": 2.0311,
|
| 24103 |
+
"num_input_tokens_seen": 70977118208,
|
| 24104 |
+
"step": 135400
|
| 24105 |
+
},
|
| 24106 |
+
{
|
| 24107 |
+
"epoch": 1.2922022013666123,
|
| 24108 |
+
"grad_norm": 0.1188386008143425,
|
| 24109 |
+
"learning_rate": 6.375199646360142e-05,
|
| 24110 |
+
"loss": 2.0311,
|
| 24111 |
+
"num_input_tokens_seen": 71003331520,
|
| 24112 |
+
"step": 135450
|
| 24113 |
+
},
|
| 24114 |
+
{
|
| 24115 |
+
"epoch": 1.2926792038828006,
|
| 24116 |
+
"grad_norm": 0.11646123230457306,
|
| 24117 |
+
"learning_rate": 6.238828904562316e-05,
|
| 24118 |
+
"loss": 2.037,
|
| 24119 |
+
"num_input_tokens_seen": 71029545920,
|
| 24120 |
+
"step": 135500
|
| 24121 |
+
},
|
| 24122 |
+
{
|
| 24123 |
+
"epoch": 1.2926792038828006,
|
| 24124 |
+
"eval_loss": 1.9530843496322632,
|
| 24125 |
+
"eval_runtime": 82.2362,
|
| 24126 |
+
"eval_samples_per_second": 60.8,
|
| 24127 |
+
"eval_steps_per_second": 15.2,
|
| 24128 |
+
"num_input_tokens_seen": 71029545920,
|
| 24129 |
+
"step": 135500
|
| 24130 |
+
},
|
| 24131 |
+
{
|
| 24132 |
+
"epoch": 1.2931562063989888,
|
| 24133 |
+
"grad_norm": 0.12359626591205597,
|
| 24134 |
+
"learning_rate": 6.103835408664032e-05,
|
| 24135 |
+
"loss": 2.0441,
|
| 24136 |
+
"num_input_tokens_seen": 71055753312,
|
| 24137 |
+
"step": 135550
|
| 24138 |
+
},
|
| 24139 |
+
{
|
| 24140 |
+
"epoch": 1.293633208915177,
|
| 24141 |
+
"grad_norm": 0.12097882479429245,
|
| 24142 |
+
"learning_rate": 5.9702234071631e-05,
|
| 24143 |
+
"loss": 2.0251,
|
| 24144 |
+
"num_input_tokens_seen": 71081964480,
|
| 24145 |
+
"step": 135600
|
| 24146 |
+
},
|
| 24147 |
+
{
|
| 24148 |
+
"epoch": 1.2941102114313652,
|
| 24149 |
+
"grad_norm": 0.11585067212581635,
|
| 24150 |
+
"learning_rate": 5.83799710507909e-05,
|
| 24151 |
+
"loss": 2.0352,
|
| 24152 |
+
"num_input_tokens_seen": 71108163424,
|
| 24153 |
+
"step": 135650
|
| 24154 |
+
},
|
| 24155 |
+
{
|
| 24156 |
+
"epoch": 1.2945872139475536,
|
| 24157 |
+
"grad_norm": 0.12164249271154404,
|
| 24158 |
+
"learning_rate": 5.7071606638210094e-05,
|
| 24159 |
+
"loss": 2.0314,
|
| 24160 |
+
"num_input_tokens_seen": 71134375424,
|
| 24161 |
+
"step": 135700
|
| 24162 |
+
},
|
| 24163 |
+
{
|
| 24164 |
+
"epoch": 1.2950642164637418,
|
| 24165 |
+
"grad_norm": 0.11601755023002625,
|
| 24166 |
+
"learning_rate": 5.577718201056392e-05,
|
| 24167 |
+
"loss": 2.0313,
|
| 24168 |
+
"num_input_tokens_seen": 71160582688,
|
| 24169 |
+
"step": 135750
|
| 24170 |
+
},
|
| 24171 |
+
{
|
| 24172 |
+
"epoch": 1.2955412189799302,
|
| 24173 |
+
"grad_norm": 0.11863810569047928,
|
| 24174 |
+
"learning_rate": 5.449673790581611e-05,
|
| 24175 |
+
"loss": 2.036,
|
| 24176 |
+
"num_input_tokens_seen": 71186792800,
|
| 24177 |
+
"step": 135800
|
| 24178 |
+
},
|
| 24179 |
+
{
|
| 24180 |
+
"epoch": 1.2960182214961184,
|
| 24181 |
+
"grad_norm": 0.12455905973911285,
|
| 24182 |
+
"learning_rate": 5.3230314621937556e-05,
|
| 24183 |
+
"loss": 2.0316,
|
| 24184 |
+
"num_input_tokens_seen": 71213000416,
|
| 24185 |
+
"step": 135850
|
| 24186 |
+
},
|
| 24187 |
+
{
|
| 24188 |
+
"epoch": 1.2964952240123067,
|
| 24189 |
+
"grad_norm": 0.11861378699541092,
|
| 24190 |
+
"learning_rate": 5.197795201563743e-05,
|
| 24191 |
+
"loss": 2.0334,
|
| 24192 |
+
"num_input_tokens_seen": 71239212224,
|
| 24193 |
+
"step": 135900
|
| 24194 |
+
},
|
| 24195 |
+
{
|
| 24196 |
+
"epoch": 1.2969722265284949,
|
| 24197 |
+
"grad_norm": 0.11894825845956802,
|
| 24198 |
+
"learning_rate": 5.073968950110941e-05,
|
| 24199 |
+
"loss": 2.028,
|
| 24200 |
+
"num_input_tokens_seen": 71265425728,
|
| 24201 |
+
"step": 135950
|
| 24202 |
+
},
|
| 24203 |
+
{
|
| 24204 |
+
"epoch": 1.297449229044683,
|
| 24205 |
+
"grad_norm": 0.11746333539485931,
|
| 24206 |
+
"learning_rate": 4.9515566048790485e-05,
|
| 24207 |
+
"loss": 2.0302,
|
| 24208 |
+
"num_input_tokens_seen": 71291638272,
|
| 24209 |
+
"step": 136000
|
| 24210 |
+
},
|
| 24211 |
+
{
|
| 24212 |
+
"epoch": 1.297449229044683,
|
| 24213 |
+
"eval_loss": 1.9527229070663452,
|
| 24214 |
+
"eval_runtime": 82.9319,
|
| 24215 |
+
"eval_samples_per_second": 60.29,
|
| 24216 |
+
"eval_steps_per_second": 15.073,
|
| 24217 |
+
"num_input_tokens_seen": 71291638272,
|
| 24218 |
+
"step": 136000
|
| 24219 |
}
|
| 24220 |
],
|
| 24221 |
"logging_steps": 50,
|
| 24222 |
"max_steps": 140000,
|
| 24223 |
+
"num_input_tokens_seen": 71291638272,
|
| 24224 |
"num_train_epochs": 2,
|
| 24225 |
"save_steps": 1000,
|
| 24226 |
"stateful_callbacks": {
|
|
|
|
| 24235 |
"attributes": {}
|
| 24236 |
}
|
| 24237 |
},
|
| 24238 |
+
"total_flos": 1.2617319614661919e+20,
|
| 24239 |
"train_batch_size": 32,
|
| 24240 |
"trial_name": null,
|
| 24241 |
"trial_params": null
|