Training in progress, step 5000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6e1468ef363199a8ce8dceeee806e0cd1265dabba9569f802d5e0ffdf55cf29
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:979fd7f70ce82e647328d9ca181635fd358343ae3c4356518a994deb8d2c7554
|
| 3 |
size 1072594443
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
|
| 3 |
size 14645
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c8c6451e983e45b2059a969443ca799e62ce60a9d34862e6b02e6b5034f66233
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4058,6 +4058,456 @@
|
|
| 4058 |
"mean_token_accuracy": 0.8125901579856872,
|
| 4059 |
"num_tokens": 4982012.0,
|
| 4060 |
"step": 4500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4061 |
}
|
| 4062 |
],
|
| 4063 |
"logging_steps": 10,
|
|
@@ -4077,7 +4527,7 @@
|
|
| 4077 |
"attributes": {}
|
| 4078 |
}
|
| 4079 |
},
|
| 4080 |
-
"total_flos":
|
| 4081 |
"train_batch_size": 8,
|
| 4082 |
"trial_name": null,
|
| 4083 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.007455168245013,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 5000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4058 |
"mean_token_accuracy": 0.8125901579856872,
|
| 4059 |
"num_tokens": 4982012.0,
|
| 4060 |
"step": 4500
|
| 4061 |
+
},
|
| 4062 |
+
{
|
| 4063 |
+
"epoch": 0.9087245617570018,
|
| 4064 |
+
"grad_norm": 10.875,
|
| 4065 |
+
"learning_rate": 1.3943179528510981e-05,
|
| 4066 |
+
"loss": 0.9166,
|
| 4067 |
+
"mean_token_accuracy": 0.7789366781711579,
|
| 4068 |
+
"num_tokens": 4992691.0,
|
| 4069 |
+
"step": 4510
|
| 4070 |
+
},
|
| 4071 |
+
{
|
| 4072 |
+
"epoch": 0.9107394720934918,
|
| 4073 |
+
"grad_norm": 10.6875,
|
| 4074 |
+
"learning_rate": 1.3929746792934382e-05,
|
| 4075 |
+
"loss": 0.8536,
|
| 4076 |
+
"mean_token_accuracy": 0.7905638337135314,
|
| 4077 |
+
"num_tokens": 5004578.0,
|
| 4078 |
+
"step": 4520
|
| 4079 |
+
},
|
| 4080 |
+
{
|
| 4081 |
+
"epoch": 0.9127543824299819,
|
| 4082 |
+
"grad_norm": 14.25,
|
| 4083 |
+
"learning_rate": 1.3916314057357782e-05,
|
| 4084 |
+
"loss": 0.976,
|
| 4085 |
+
"mean_token_accuracy": 0.7722863137722016,
|
| 4086 |
+
"num_tokens": 5015364.0,
|
| 4087 |
+
"step": 4530
|
| 4088 |
+
},
|
| 4089 |
+
{
|
| 4090 |
+
"epoch": 0.9147692927664719,
|
| 4091 |
+
"grad_norm": 12.9375,
|
| 4092 |
+
"learning_rate": 1.3902881321781181e-05,
|
| 4093 |
+
"loss": 0.8254,
|
| 4094 |
+
"mean_token_accuracy": 0.7985598504543304,
|
| 4095 |
+
"num_tokens": 5026777.0,
|
| 4096 |
+
"step": 4540
|
| 4097 |
+
},
|
| 4098 |
+
{
|
| 4099 |
+
"epoch": 0.9167842031029619,
|
| 4100 |
+
"grad_norm": 14.3125,
|
| 4101 |
+
"learning_rate": 1.3889448586204582e-05,
|
| 4102 |
+
"loss": 0.8435,
|
| 4103 |
+
"mean_token_accuracy": 0.7964738607406616,
|
| 4104 |
+
"num_tokens": 5038156.0,
|
| 4105 |
+
"step": 4550
|
| 4106 |
+
},
|
| 4107 |
+
{
|
| 4108 |
+
"epoch": 0.918799113439452,
|
| 4109 |
+
"grad_norm": 10.8125,
|
| 4110 |
+
"learning_rate": 1.3876015850627982e-05,
|
| 4111 |
+
"loss": 0.9532,
|
| 4112 |
+
"mean_token_accuracy": 0.7702659487724304,
|
| 4113 |
+
"num_tokens": 5049182.0,
|
| 4114 |
+
"step": 4560
|
| 4115 |
+
},
|
| 4116 |
+
{
|
| 4117 |
+
"epoch": 0.920814023775942,
|
| 4118 |
+
"grad_norm": 10.375,
|
| 4119 |
+
"learning_rate": 1.3862583115051383e-05,
|
| 4120 |
+
"loss": 0.749,
|
| 4121 |
+
"mean_token_accuracy": 0.8094150185585022,
|
| 4122 |
+
"num_tokens": 5060508.0,
|
| 4123 |
+
"step": 4570
|
| 4124 |
+
},
|
| 4125 |
+
{
|
| 4126 |
+
"epoch": 0.9228289341124319,
|
| 4127 |
+
"grad_norm": 11.1875,
|
| 4128 |
+
"learning_rate": 1.384915037947478e-05,
|
| 4129 |
+
"loss": 0.9101,
|
| 4130 |
+
"mean_token_accuracy": 0.780214524269104,
|
| 4131 |
+
"num_tokens": 5072338.0,
|
| 4132 |
+
"step": 4580
|
| 4133 |
+
},
|
| 4134 |
+
{
|
| 4135 |
+
"epoch": 0.924843844448922,
|
| 4136 |
+
"grad_norm": 11.0625,
|
| 4137 |
+
"learning_rate": 1.383571764389818e-05,
|
| 4138 |
+
"loss": 0.8272,
|
| 4139 |
+
"mean_token_accuracy": 0.7955898463726043,
|
| 4140 |
+
"num_tokens": 5083561.0,
|
| 4141 |
+
"step": 4590
|
| 4142 |
+
},
|
| 4143 |
+
{
|
| 4144 |
+
"epoch": 0.926858754785412,
|
| 4145 |
+
"grad_norm": 11.125,
|
| 4146 |
+
"learning_rate": 1.382228490832158e-05,
|
| 4147 |
+
"loss": 0.8138,
|
| 4148 |
+
"mean_token_accuracy": 0.797929847240448,
|
| 4149 |
+
"num_tokens": 5095032.0,
|
| 4150 |
+
"step": 4600
|
| 4151 |
+
},
|
| 4152 |
+
{
|
| 4153 |
+
"epoch": 0.9288736651219021,
|
| 4154 |
+
"grad_norm": 13.5,
|
| 4155 |
+
"learning_rate": 1.380885217274498e-05,
|
| 4156 |
+
"loss": 0.8166,
|
| 4157 |
+
"mean_token_accuracy": 0.7943983316421509,
|
| 4158 |
+
"num_tokens": 5105707.0,
|
| 4159 |
+
"step": 4610
|
| 4160 |
+
},
|
| 4161 |
+
{
|
| 4162 |
+
"epoch": 0.9308885754583921,
|
| 4163 |
+
"grad_norm": 12.0625,
|
| 4164 |
+
"learning_rate": 1.379541943716838e-05,
|
| 4165 |
+
"loss": 0.8926,
|
| 4166 |
+
"mean_token_accuracy": 0.7872261703014374,
|
| 4167 |
+
"num_tokens": 5116277.0,
|
| 4168 |
+
"step": 4620
|
| 4169 |
+
},
|
| 4170 |
+
{
|
| 4171 |
+
"epoch": 0.9329034857948821,
|
| 4172 |
+
"grad_norm": 10.125,
|
| 4173 |
+
"learning_rate": 1.378198670159178e-05,
|
| 4174 |
+
"loss": 0.8729,
|
| 4175 |
+
"mean_token_accuracy": 0.7849249064922332,
|
| 4176 |
+
"num_tokens": 5128524.0,
|
| 4177 |
+
"step": 4630
|
| 4178 |
+
},
|
| 4179 |
+
{
|
| 4180 |
+
"epoch": 0.9349183961313722,
|
| 4181 |
+
"grad_norm": 10.625,
|
| 4182 |
+
"learning_rate": 1.376855396601518e-05,
|
| 4183 |
+
"loss": 0.8558,
|
| 4184 |
+
"mean_token_accuracy": 0.7900417923927308,
|
| 4185 |
+
"num_tokens": 5139328.0,
|
| 4186 |
+
"step": 4640
|
| 4187 |
+
},
|
| 4188 |
+
{
|
| 4189 |
+
"epoch": 0.9369333064678622,
|
| 4190 |
+
"grad_norm": 10.625,
|
| 4191 |
+
"learning_rate": 1.375512123043858e-05,
|
| 4192 |
+
"loss": 0.8806,
|
| 4193 |
+
"mean_token_accuracy": 0.7793081521987915,
|
| 4194 |
+
"num_tokens": 5152047.0,
|
| 4195 |
+
"step": 4650
|
| 4196 |
+
},
|
| 4197 |
+
{
|
| 4198 |
+
"epoch": 0.9389482168043523,
|
| 4199 |
+
"grad_norm": 10.6875,
|
| 4200 |
+
"learning_rate": 1.374168849486198e-05,
|
| 4201 |
+
"loss": 0.9167,
|
| 4202 |
+
"mean_token_accuracy": 0.7699385344982147,
|
| 4203 |
+
"num_tokens": 5163236.0,
|
| 4204 |
+
"step": 4660
|
| 4205 |
+
},
|
| 4206 |
+
{
|
| 4207 |
+
"epoch": 0.9409631271408422,
|
| 4208 |
+
"grad_norm": 12.6875,
|
| 4209 |
+
"learning_rate": 1.3728255759285381e-05,
|
| 4210 |
+
"loss": 0.778,
|
| 4211 |
+
"mean_token_accuracy": 0.8066167533397675,
|
| 4212 |
+
"num_tokens": 5173182.0,
|
| 4213 |
+
"step": 4670
|
| 4214 |
+
},
|
| 4215 |
+
{
|
| 4216 |
+
"epoch": 0.9429780374773322,
|
| 4217 |
+
"grad_norm": 10.8125,
|
| 4218 |
+
"learning_rate": 1.3714823023708778e-05,
|
| 4219 |
+
"loss": 0.8731,
|
| 4220 |
+
"mean_token_accuracy": 0.7907391846179962,
|
| 4221 |
+
"num_tokens": 5184212.0,
|
| 4222 |
+
"step": 4680
|
| 4223 |
+
},
|
| 4224 |
+
{
|
| 4225 |
+
"epoch": 0.9449929478138223,
|
| 4226 |
+
"grad_norm": 14.1875,
|
| 4227 |
+
"learning_rate": 1.3701390288132179e-05,
|
| 4228 |
+
"loss": 0.8561,
|
| 4229 |
+
"mean_token_accuracy": 0.7872300326824189,
|
| 4230 |
+
"num_tokens": 5195178.0,
|
| 4231 |
+
"step": 4690
|
| 4232 |
+
},
|
| 4233 |
+
{
|
| 4234 |
+
"epoch": 0.9470078581503123,
|
| 4235 |
+
"grad_norm": 10.0625,
|
| 4236 |
+
"learning_rate": 1.368795755255558e-05,
|
| 4237 |
+
"loss": 0.8959,
|
| 4238 |
+
"mean_token_accuracy": 0.785253643989563,
|
| 4239 |
+
"num_tokens": 5206174.0,
|
| 4240 |
+
"step": 4700
|
| 4241 |
+
},
|
| 4242 |
+
{
|
| 4243 |
+
"epoch": 0.9490227684868023,
|
| 4244 |
+
"grad_norm": 10.1875,
|
| 4245 |
+
"learning_rate": 1.3674524816978978e-05,
|
| 4246 |
+
"loss": 0.82,
|
| 4247 |
+
"mean_token_accuracy": 0.8018651187419892,
|
| 4248 |
+
"num_tokens": 5216839.0,
|
| 4249 |
+
"step": 4710
|
| 4250 |
+
},
|
| 4251 |
+
{
|
| 4252 |
+
"epoch": 0.9510376788232924,
|
| 4253 |
+
"grad_norm": 12.375,
|
| 4254 |
+
"learning_rate": 1.3661092081402379e-05,
|
| 4255 |
+
"loss": 1.0637,
|
| 4256 |
+
"mean_token_accuracy": 0.7501500964164733,
|
| 4257 |
+
"num_tokens": 5228271.0,
|
| 4258 |
+
"step": 4720
|
| 4259 |
+
},
|
| 4260 |
+
{
|
| 4261 |
+
"epoch": 0.9530525891597824,
|
| 4262 |
+
"grad_norm": 12.625,
|
| 4263 |
+
"learning_rate": 1.364765934582578e-05,
|
| 4264 |
+
"loss": 0.9295,
|
| 4265 |
+
"mean_token_accuracy": 0.7782152414321899,
|
| 4266 |
+
"num_tokens": 5238828.0,
|
| 4267 |
+
"step": 4730
|
| 4268 |
+
},
|
| 4269 |
+
{
|
| 4270 |
+
"epoch": 0.9550674994962725,
|
| 4271 |
+
"grad_norm": 12.0625,
|
| 4272 |
+
"learning_rate": 1.363422661024918e-05,
|
| 4273 |
+
"loss": 0.9794,
|
| 4274 |
+
"mean_token_accuracy": 0.7655089437961579,
|
| 4275 |
+
"num_tokens": 5250865.0,
|
| 4276 |
+
"step": 4740
|
| 4277 |
+
},
|
| 4278 |
+
{
|
| 4279 |
+
"epoch": 0.9570824098327625,
|
| 4280 |
+
"grad_norm": 12.75,
|
| 4281 |
+
"learning_rate": 1.3620793874672577e-05,
|
| 4282 |
+
"loss": 0.8559,
|
| 4283 |
+
"mean_token_accuracy": 0.783417934179306,
|
| 4284 |
+
"num_tokens": 5261161.0,
|
| 4285 |
+
"step": 4750
|
| 4286 |
+
},
|
| 4287 |
+
{
|
| 4288 |
+
"epoch": 0.9590973201692524,
|
| 4289 |
+
"grad_norm": 13.125,
|
| 4290 |
+
"learning_rate": 1.3607361139095977e-05,
|
| 4291 |
+
"loss": 0.8129,
|
| 4292 |
+
"mean_token_accuracy": 0.8023806989192963,
|
| 4293 |
+
"num_tokens": 5271735.0,
|
| 4294 |
+
"step": 4760
|
| 4295 |
+
},
|
| 4296 |
+
{
|
| 4297 |
+
"epoch": 0.9611122305057425,
|
| 4298 |
+
"grad_norm": 10.3125,
|
| 4299 |
+
"learning_rate": 1.3593928403519378e-05,
|
| 4300 |
+
"loss": 0.9024,
|
| 4301 |
+
"mean_token_accuracy": 0.7816856026649475,
|
| 4302 |
+
"num_tokens": 5282948.0,
|
| 4303 |
+
"step": 4770
|
| 4304 |
+
},
|
| 4305 |
+
{
|
| 4306 |
+
"epoch": 0.9631271408422325,
|
| 4307 |
+
"grad_norm": 16.375,
|
| 4308 |
+
"learning_rate": 1.3580495667942777e-05,
|
| 4309 |
+
"loss": 0.7899,
|
| 4310 |
+
"mean_token_accuracy": 0.8060350120067596,
|
| 4311 |
+
"num_tokens": 5293161.0,
|
| 4312 |
+
"step": 4780
|
| 4313 |
+
},
|
| 4314 |
+
{
|
| 4315 |
+
"epoch": 0.9651420511787225,
|
| 4316 |
+
"grad_norm": 12.0625,
|
| 4317 |
+
"learning_rate": 1.3567062932366177e-05,
|
| 4318 |
+
"loss": 0.9497,
|
| 4319 |
+
"mean_token_accuracy": 0.7778710544109344,
|
| 4320 |
+
"num_tokens": 5303823.0,
|
| 4321 |
+
"step": 4790
|
| 4322 |
+
},
|
| 4323 |
+
{
|
| 4324 |
+
"epoch": 0.9671569615152126,
|
| 4325 |
+
"grad_norm": 12.375,
|
| 4326 |
+
"learning_rate": 1.3553630196789578e-05,
|
| 4327 |
+
"loss": 0.8374,
|
| 4328 |
+
"mean_token_accuracy": 0.7918814778327942,
|
| 4329 |
+
"num_tokens": 5314457.0,
|
| 4330 |
+
"step": 4800
|
| 4331 |
+
},
|
| 4332 |
+
{
|
| 4333 |
+
"epoch": 0.9691718718517026,
|
| 4334 |
+
"grad_norm": 13.4375,
|
| 4335 |
+
"learning_rate": 1.3540197461212977e-05,
|
| 4336 |
+
"loss": 1.0195,
|
| 4337 |
+
"mean_token_accuracy": 0.7530766189098358,
|
| 4338 |
+
"num_tokens": 5324766.0,
|
| 4339 |
+
"step": 4810
|
| 4340 |
+
},
|
| 4341 |
+
{
|
| 4342 |
+
"epoch": 0.9711867821881927,
|
| 4343 |
+
"grad_norm": 12.8125,
|
| 4344 |
+
"learning_rate": 1.3526764725636377e-05,
|
| 4345 |
+
"loss": 0.8813,
|
| 4346 |
+
"mean_token_accuracy": 0.7811478495597839,
|
| 4347 |
+
"num_tokens": 5336093.0,
|
| 4348 |
+
"step": 4820
|
| 4349 |
+
},
|
| 4350 |
+
{
|
| 4351 |
+
"epoch": 0.9732016925246827,
|
| 4352 |
+
"grad_norm": 10.4375,
|
| 4353 |
+
"learning_rate": 1.3513331990059778e-05,
|
| 4354 |
+
"loss": 0.8947,
|
| 4355 |
+
"mean_token_accuracy": 0.7876034200191497,
|
| 4356 |
+
"num_tokens": 5346706.0,
|
| 4357 |
+
"step": 4830
|
| 4358 |
+
},
|
| 4359 |
+
{
|
| 4360 |
+
"epoch": 0.9752166028611727,
|
| 4361 |
+
"grad_norm": 15.25,
|
| 4362 |
+
"learning_rate": 1.3499899254483178e-05,
|
| 4363 |
+
"loss": 0.8773,
|
| 4364 |
+
"mean_token_accuracy": 0.7921059668064118,
|
| 4365 |
+
"num_tokens": 5357534.0,
|
| 4366 |
+
"step": 4840
|
| 4367 |
+
},
|
| 4368 |
+
{
|
| 4369 |
+
"epoch": 0.9772315131976627,
|
| 4370 |
+
"grad_norm": 10.75,
|
| 4371 |
+
"learning_rate": 1.3486466518906575e-05,
|
| 4372 |
+
"loss": 0.895,
|
| 4373 |
+
"mean_token_accuracy": 0.7818942189216613,
|
| 4374 |
+
"num_tokens": 5370003.0,
|
| 4375 |
+
"step": 4850
|
| 4376 |
+
},
|
| 4377 |
+
{
|
| 4378 |
+
"epoch": 0.9792464235341527,
|
| 4379 |
+
"grad_norm": 11.3125,
|
| 4380 |
+
"learning_rate": 1.3473033783329976e-05,
|
| 4381 |
+
"loss": 0.7589,
|
| 4382 |
+
"mean_token_accuracy": 0.8098963499069214,
|
| 4383 |
+
"num_tokens": 5381355.0,
|
| 4384 |
+
"step": 4860
|
| 4385 |
+
},
|
| 4386 |
+
{
|
| 4387 |
+
"epoch": 0.9812613338706427,
|
| 4388 |
+
"grad_norm": 12.0625,
|
| 4389 |
+
"learning_rate": 1.3459601047753376e-05,
|
| 4390 |
+
"loss": 0.793,
|
| 4391 |
+
"mean_token_accuracy": 0.8019460260868072,
|
| 4392 |
+
"num_tokens": 5392541.0,
|
| 4393 |
+
"step": 4870
|
| 4394 |
+
},
|
| 4395 |
+
{
|
| 4396 |
+
"epoch": 0.9832762442071328,
|
| 4397 |
+
"grad_norm": 14.5,
|
| 4398 |
+
"learning_rate": 1.3446168312176775e-05,
|
| 4399 |
+
"loss": 0.9046,
|
| 4400 |
+
"mean_token_accuracy": 0.781462025642395,
|
| 4401 |
+
"num_tokens": 5403838.0,
|
| 4402 |
+
"step": 4880
|
| 4403 |
+
},
|
| 4404 |
+
{
|
| 4405 |
+
"epoch": 0.9852911545436228,
|
| 4406 |
+
"grad_norm": 11.25,
|
| 4407 |
+
"learning_rate": 1.3432735576600176e-05,
|
| 4408 |
+
"loss": 0.9039,
|
| 4409 |
+
"mean_token_accuracy": 0.7840433418750763,
|
| 4410 |
+
"num_tokens": 5414995.0,
|
| 4411 |
+
"step": 4890
|
| 4412 |
+
},
|
| 4413 |
+
{
|
| 4414 |
+
"epoch": 0.9873060648801129,
|
| 4415 |
+
"grad_norm": 9.5,
|
| 4416 |
+
"learning_rate": 1.3419302841023576e-05,
|
| 4417 |
+
"loss": 0.8909,
|
| 4418 |
+
"mean_token_accuracy": 0.7859670460224152,
|
| 4419 |
+
"num_tokens": 5426001.0,
|
| 4420 |
+
"step": 4900
|
| 4421 |
+
},
|
| 4422 |
+
{
|
| 4423 |
+
"epoch": 0.9893209752166029,
|
| 4424 |
+
"grad_norm": 13.375,
|
| 4425 |
+
"learning_rate": 1.3405870105446977e-05,
|
| 4426 |
+
"loss": 0.8171,
|
| 4427 |
+
"mean_token_accuracy": 0.7865999937057495,
|
| 4428 |
+
"num_tokens": 5438081.0,
|
| 4429 |
+
"step": 4910
|
| 4430 |
+
},
|
| 4431 |
+
{
|
| 4432 |
+
"epoch": 0.9913358855530929,
|
| 4433 |
+
"grad_norm": 14.3125,
|
| 4434 |
+
"learning_rate": 1.3392437369870374e-05,
|
| 4435 |
+
"loss": 0.9477,
|
| 4436 |
+
"mean_token_accuracy": 0.773062938451767,
|
| 4437 |
+
"num_tokens": 5449464.0,
|
| 4438 |
+
"step": 4920
|
| 4439 |
+
},
|
| 4440 |
+
{
|
| 4441 |
+
"epoch": 0.9933507958895829,
|
| 4442 |
+
"grad_norm": 11.3125,
|
| 4443 |
+
"learning_rate": 1.3379004634293775e-05,
|
| 4444 |
+
"loss": 0.9291,
|
| 4445 |
+
"mean_token_accuracy": 0.7776412189006805,
|
| 4446 |
+
"num_tokens": 5461080.0,
|
| 4447 |
+
"step": 4930
|
| 4448 |
+
},
|
| 4449 |
+
{
|
| 4450 |
+
"epoch": 0.9953657062260729,
|
| 4451 |
+
"grad_norm": 11.625,
|
| 4452 |
+
"learning_rate": 1.3365571898717175e-05,
|
| 4453 |
+
"loss": 0.8828,
|
| 4454 |
+
"mean_token_accuracy": 0.7811066091060639,
|
| 4455 |
+
"num_tokens": 5472144.0,
|
| 4456 |
+
"step": 4940
|
| 4457 |
+
},
|
| 4458 |
+
{
|
| 4459 |
+
"epoch": 0.9973806165625629,
|
| 4460 |
+
"grad_norm": 9.875,
|
| 4461 |
+
"learning_rate": 1.3352139163140574e-05,
|
| 4462 |
+
"loss": 0.8439,
|
| 4463 |
+
"mean_token_accuracy": 0.7922865450382233,
|
| 4464 |
+
"num_tokens": 5484117.0,
|
| 4465 |
+
"step": 4950
|
| 4466 |
+
},
|
| 4467 |
+
{
|
| 4468 |
+
"epoch": 0.999395526899053,
|
| 4469 |
+
"grad_norm": 10.5,
|
| 4470 |
+
"learning_rate": 1.3338706427563974e-05,
|
| 4471 |
+
"loss": 0.8985,
|
| 4472 |
+
"mean_token_accuracy": 0.7755892872810364,
|
| 4473 |
+
"num_tokens": 5495916.0,
|
| 4474 |
+
"step": 4960
|
| 4475 |
+
},
|
| 4476 |
+
{
|
| 4477 |
+
"epoch": 1.001410437235543,
|
| 4478 |
+
"grad_norm": 12.875,
|
| 4479 |
+
"learning_rate": 1.3325273691987375e-05,
|
| 4480 |
+
"loss": 0.8131,
|
| 4481 |
+
"mean_token_accuracy": 0.7963548183441163,
|
| 4482 |
+
"num_tokens": 5506891.0,
|
| 4483 |
+
"step": 4970
|
| 4484 |
+
},
|
| 4485 |
+
{
|
| 4486 |
+
"epoch": 1.003425347572033,
|
| 4487 |
+
"grad_norm": 10.3125,
|
| 4488 |
+
"learning_rate": 1.3311840956410774e-05,
|
| 4489 |
+
"loss": 0.7879,
|
| 4490 |
+
"mean_token_accuracy": 0.7989233016967774,
|
| 4491 |
+
"num_tokens": 5519454.0,
|
| 4492 |
+
"step": 4980
|
| 4493 |
+
},
|
| 4494 |
+
{
|
| 4495 |
+
"epoch": 1.005440257908523,
|
| 4496 |
+
"grad_norm": 11.875,
|
| 4497 |
+
"learning_rate": 1.3298408220834174e-05,
|
| 4498 |
+
"loss": 0.7878,
|
| 4499 |
+
"mean_token_accuracy": 0.8067593216896057,
|
| 4500 |
+
"num_tokens": 5529707.0,
|
| 4501 |
+
"step": 4990
|
| 4502 |
+
},
|
| 4503 |
+
{
|
| 4504 |
+
"epoch": 1.007455168245013,
|
| 4505 |
+
"grad_norm": 11.1875,
|
| 4506 |
+
"learning_rate": 1.3284975485257575e-05,
|
| 4507 |
+
"loss": 0.7955,
|
| 4508 |
+
"mean_token_accuracy": 0.800259780883789,
|
| 4509 |
+
"num_tokens": 5541015.0,
|
| 4510 |
+
"step": 5000
|
| 4511 |
}
|
| 4512 |
],
|
| 4513 |
"logging_steps": 10,
|
|
|
|
| 4527 |
"attributes": {}
|
| 4528 |
}
|
| 4529 |
},
|
| 4530 |
+
"total_flos": 6697551334397952.0,
|
| 4531 |
"train_batch_size": 8,
|
| 4532 |
"trial_name": null,
|
| 4533 |
"trial_params": null
|