Training in progress, step 2500, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2066752
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:412ae50cdeb5cca99c6d46aab796b0711066e3d9f4b41a911e3eb9d3dc6de17f
|
| 3 |
size 2066752
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4121235
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:836b014a8c276d1f5618f6f4807f34376d58eccee7d3467c4acd8af2f036f8f3
|
| 3 |
size 4121235
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14391
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed9d71331f73f26faac079d17a5f8873c17bceffe8dbf3eb835123619d3824be
|
| 3 |
size 14391
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1401
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28a833366aa970d3c976fd14c1ac36f1a287b5de565f4adb4a55d51debbe07ea
|
| 3 |
size 1401
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -17000,6 +17000,714 @@
|
|
| 17000 |
"eval_samples_per_second": 1.733,
|
| 17001 |
"eval_steps_per_second": 0.217,
|
| 17002 |
"step": 2400
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17003 |
}
|
| 17004 |
],
|
| 17005 |
"logging_steps": 1,
|
|
@@ -17019,7 +17727,7 @@
|
|
| 17019 |
"attributes": {}
|
| 17020 |
}
|
| 17021 |
},
|
| 17022 |
-
"total_flos":
|
| 17023 |
"train_batch_size": 1,
|
| 17024 |
"trial_name": null,
|
| 17025 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.10797270450030233,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 2500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 17000 |
"eval_samples_per_second": 1.733,
|
| 17001 |
"eval_steps_per_second": 0.217,
|
| 17002 |
"step": 2400
|
| 17003 |
+
},
|
| 17004 |
+
{
|
| 17005 |
+
"epoch": 0.10369698540209035,
|
| 17006 |
+
"grad_norm": 0.70703125,
|
| 17007 |
+
"learning_rate": 0.0009921538538081587,
|
| 17008 |
+
"loss": 8.4378,
|
| 17009 |
+
"step": 2401
|
| 17010 |
+
},
|
| 17011 |
+
{
|
| 17012 |
+
"epoch": 0.10374017448389047,
|
| 17013 |
+
"grad_norm": 0.75,
|
| 17014 |
+
"learning_rate": 0.000992141247241211,
|
| 17015 |
+
"loss": 8.099,
|
| 17016 |
+
"step": 2402
|
| 17017 |
+
},
|
| 17018 |
+
{
|
| 17019 |
+
"epoch": 0.1037833635656906,
|
| 17020 |
+
"grad_norm": 0.47265625,
|
| 17021 |
+
"learning_rate": 0.0009921286306349944,
|
| 17022 |
+
"loss": 8.683,
|
| 17023 |
+
"step": 2403
|
| 17024 |
+
},
|
| 17025 |
+
{
|
| 17026 |
+
"epoch": 0.10382655264749072,
|
| 17027 |
+
"grad_norm": 1.484375,
|
| 17028 |
+
"learning_rate": 0.0009921160039897661,
|
| 17029 |
+
"loss": 8.6846,
|
| 17030 |
+
"step": 2404
|
| 17031 |
+
},
|
| 17032 |
+
{
|
| 17033 |
+
"epoch": 0.10386974172929084,
|
| 17034 |
+
"grad_norm": 0.80078125,
|
| 17035 |
+
"learning_rate": 0.000992103367305784,
|
| 17036 |
+
"loss": 7.8544,
|
| 17037 |
+
"step": 2405
|
| 17038 |
+
},
|
| 17039 |
+
{
|
| 17040 |
+
"epoch": 0.10391293081109096,
|
| 17041 |
+
"grad_norm": 0.5859375,
|
| 17042 |
+
"learning_rate": 0.0009920907205833056,
|
| 17043 |
+
"loss": 8.1833,
|
| 17044 |
+
"step": 2406
|
| 17045 |
+
},
|
| 17046 |
+
{
|
| 17047 |
+
"epoch": 0.10395611989289108,
|
| 17048 |
+
"grad_norm": 0.5703125,
|
| 17049 |
+
"learning_rate": 0.0009920780638225891,
|
| 17050 |
+
"loss": 8.2644,
|
| 17051 |
+
"step": 2407
|
| 17052 |
+
},
|
| 17053 |
+
{
|
| 17054 |
+
"epoch": 0.1039993089746912,
|
| 17055 |
+
"grad_norm": 0.72265625,
|
| 17056 |
+
"learning_rate": 0.0009920653970238924,
|
| 17057 |
+
"loss": 8.0629,
|
| 17058 |
+
"step": 2408
|
| 17059 |
+
},
|
| 17060 |
+
{
|
| 17061 |
+
"epoch": 0.10404249805649132,
|
| 17062 |
+
"grad_norm": 0.64453125,
|
| 17063 |
+
"learning_rate": 0.000992052720187474,
|
| 17064 |
+
"loss": 8.1571,
|
| 17065 |
+
"step": 2409
|
| 17066 |
+
},
|
| 17067 |
+
{
|
| 17068 |
+
"epoch": 0.10408568713829144,
|
| 17069 |
+
"grad_norm": 0.46484375,
|
| 17070 |
+
"learning_rate": 0.0009920400333135926,
|
| 17071 |
+
"loss": 8.3833,
|
| 17072 |
+
"step": 2410
|
| 17073 |
+
},
|
| 17074 |
+
{
|
| 17075 |
+
"epoch": 0.10412887622009157,
|
| 17076 |
+
"grad_norm": 0.65234375,
|
| 17077 |
+
"learning_rate": 0.000992027336402507,
|
| 17078 |
+
"loss": 8.6215,
|
| 17079 |
+
"step": 2411
|
| 17080 |
+
},
|
| 17081 |
+
{
|
| 17082 |
+
"epoch": 0.10417206530189169,
|
| 17083 |
+
"grad_norm": 0.63671875,
|
| 17084 |
+
"learning_rate": 0.0009920146294544762,
|
| 17085 |
+
"loss": 8.2631,
|
| 17086 |
+
"step": 2412
|
| 17087 |
+
},
|
| 17088 |
+
{
|
| 17089 |
+
"epoch": 0.10421525438369181,
|
| 17090 |
+
"grad_norm": 0.765625,
|
| 17091 |
+
"learning_rate": 0.0009920019124697592,
|
| 17092 |
+
"loss": 8.0769,
|
| 17093 |
+
"step": 2413
|
| 17094 |
+
},
|
| 17095 |
+
{
|
| 17096 |
+
"epoch": 0.10425844346549193,
|
| 17097 |
+
"grad_norm": 0.75,
|
| 17098 |
+
"learning_rate": 0.0009919891854486159,
|
| 17099 |
+
"loss": 8.5708,
|
| 17100 |
+
"step": 2414
|
| 17101 |
+
},
|
| 17102 |
+
{
|
| 17103 |
+
"epoch": 0.10430163254729205,
|
| 17104 |
+
"grad_norm": 0.6171875,
|
| 17105 |
+
"learning_rate": 0.000991976448391305,
|
| 17106 |
+
"loss": 7.9082,
|
| 17107 |
+
"step": 2415
|
| 17108 |
+
},
|
| 17109 |
+
{
|
| 17110 |
+
"epoch": 0.10434482162909217,
|
| 17111 |
+
"grad_norm": 0.57421875,
|
| 17112 |
+
"learning_rate": 0.0009919637012980875,
|
| 17113 |
+
"loss": 8.4871,
|
| 17114 |
+
"step": 2416
|
| 17115 |
+
},
|
| 17116 |
+
{
|
| 17117 |
+
"epoch": 0.10438801071089229,
|
| 17118 |
+
"grad_norm": 0.57421875,
|
| 17119 |
+
"learning_rate": 0.0009919509441692227,
|
| 17120 |
+
"loss": 8.658,
|
| 17121 |
+
"step": 2417
|
| 17122 |
+
},
|
| 17123 |
+
{
|
| 17124 |
+
"epoch": 0.10443119979269241,
|
| 17125 |
+
"grad_norm": 0.640625,
|
| 17126 |
+
"learning_rate": 0.000991938177004971,
|
| 17127 |
+
"loss": 8.1744,
|
| 17128 |
+
"step": 2418
|
| 17129 |
+
},
|
| 17130 |
+
{
|
| 17131 |
+
"epoch": 0.10447438887449252,
|
| 17132 |
+
"grad_norm": 0.67578125,
|
| 17133 |
+
"learning_rate": 0.0009919253998055928,
|
| 17134 |
+
"loss": 8.255,
|
| 17135 |
+
"step": 2419
|
| 17136 |
+
},
|
| 17137 |
+
{
|
| 17138 |
+
"epoch": 0.10451757795629264,
|
| 17139 |
+
"grad_norm": 0.69140625,
|
| 17140 |
+
"learning_rate": 0.0009919126125713489,
|
| 17141 |
+
"loss": 8.3867,
|
| 17142 |
+
"step": 2420
|
| 17143 |
+
},
|
| 17144 |
+
{
|
| 17145 |
+
"epoch": 0.10456076703809276,
|
| 17146 |
+
"grad_norm": 0.68359375,
|
| 17147 |
+
"learning_rate": 0.0009918998153024999,
|
| 17148 |
+
"loss": 8.0586,
|
| 17149 |
+
"step": 2421
|
| 17150 |
+
},
|
| 17151 |
+
{
|
| 17152 |
+
"epoch": 0.10460395611989289,
|
| 17153 |
+
"grad_norm": 0.61328125,
|
| 17154 |
+
"learning_rate": 0.0009918870079993068,
|
| 17155 |
+
"loss": 8.2327,
|
| 17156 |
+
"step": 2422
|
| 17157 |
+
},
|
| 17158 |
+
{
|
| 17159 |
+
"epoch": 0.104647145201693,
|
| 17160 |
+
"grad_norm": 0.84765625,
|
| 17161 |
+
"learning_rate": 0.0009918741906620313,
|
| 17162 |
+
"loss": 8.5985,
|
| 17163 |
+
"step": 2423
|
| 17164 |
+
},
|
| 17165 |
+
{
|
| 17166 |
+
"epoch": 0.10469033428349313,
|
| 17167 |
+
"grad_norm": 3.375,
|
| 17168 |
+
"learning_rate": 0.0009918613632909346,
|
| 17169 |
+
"loss": 9.423,
|
| 17170 |
+
"step": 2424
|
| 17171 |
+
},
|
| 17172 |
+
{
|
| 17173 |
+
"epoch": 0.10473352336529325,
|
| 17174 |
+
"grad_norm": 0.6484375,
|
| 17175 |
+
"learning_rate": 0.0009918485258862781,
|
| 17176 |
+
"loss": 8.1833,
|
| 17177 |
+
"step": 2425
|
| 17178 |
+
},
|
| 17179 |
+
{
|
| 17180 |
+
"epoch": 0.10477671244709337,
|
| 17181 |
+
"grad_norm": 0.4375,
|
| 17182 |
+
"learning_rate": 0.0009918356784483242,
|
| 17183 |
+
"loss": 8.5462,
|
| 17184 |
+
"step": 2426
|
| 17185 |
+
},
|
| 17186 |
+
{
|
| 17187 |
+
"epoch": 0.10481990152889349,
|
| 17188 |
+
"grad_norm": 0.60546875,
|
| 17189 |
+
"learning_rate": 0.0009918228209773346,
|
| 17190 |
+
"loss": 8.5186,
|
| 17191 |
+
"step": 2427
|
| 17192 |
+
},
|
| 17193 |
+
{
|
| 17194 |
+
"epoch": 0.10486309061069361,
|
| 17195 |
+
"grad_norm": 0.498046875,
|
| 17196 |
+
"learning_rate": 0.0009918099534735718,
|
| 17197 |
+
"loss": 8.562,
|
| 17198 |
+
"step": 2428
|
| 17199 |
+
},
|
| 17200 |
+
{
|
| 17201 |
+
"epoch": 0.10490627969249373,
|
| 17202 |
+
"grad_norm": 0.64453125,
|
| 17203 |
+
"learning_rate": 0.000991797075937298,
|
| 17204 |
+
"loss": 8.1776,
|
| 17205 |
+
"step": 2429
|
| 17206 |
+
},
|
| 17207 |
+
{
|
| 17208 |
+
"epoch": 0.10494946877429386,
|
| 17209 |
+
"grad_norm": 0.62890625,
|
| 17210 |
+
"learning_rate": 0.0009917841883687764,
|
| 17211 |
+
"loss": 8.3763,
|
| 17212 |
+
"step": 2430
|
| 17213 |
+
},
|
| 17214 |
+
{
|
| 17215 |
+
"epoch": 0.10499265785609398,
|
| 17216 |
+
"grad_norm": 0.6015625,
|
| 17217 |
+
"learning_rate": 0.0009917712907682693,
|
| 17218 |
+
"loss": 8.2266,
|
| 17219 |
+
"step": 2431
|
| 17220 |
+
},
|
| 17221 |
+
{
|
| 17222 |
+
"epoch": 0.1050358469378941,
|
| 17223 |
+
"grad_norm": 0.5,
|
| 17224 |
+
"learning_rate": 0.0009917583831360402,
|
| 17225 |
+
"loss": 8.4739,
|
| 17226 |
+
"step": 2432
|
| 17227 |
+
},
|
| 17228 |
+
{
|
| 17229 |
+
"epoch": 0.10507903601969422,
|
| 17230 |
+
"grad_norm": 0.859375,
|
| 17231 |
+
"learning_rate": 0.0009917454654723523,
|
| 17232 |
+
"loss": 8.3661,
|
| 17233 |
+
"step": 2433
|
| 17234 |
+
},
|
| 17235 |
+
{
|
| 17236 |
+
"epoch": 0.10512222510149434,
|
| 17237 |
+
"grad_norm": 0.57421875,
|
| 17238 |
+
"learning_rate": 0.0009917325377774688,
|
| 17239 |
+
"loss": 8.2533,
|
| 17240 |
+
"step": 2434
|
| 17241 |
+
},
|
| 17242 |
+
{
|
| 17243 |
+
"epoch": 0.10516541418329446,
|
| 17244 |
+
"grad_norm": 0.6171875,
|
| 17245 |
+
"learning_rate": 0.000991719600051654,
|
| 17246 |
+
"loss": 8.4593,
|
| 17247 |
+
"step": 2435
|
| 17248 |
+
},
|
| 17249 |
+
{
|
| 17250 |
+
"epoch": 0.10520860326509458,
|
| 17251 |
+
"grad_norm": 0.478515625,
|
| 17252 |
+
"learning_rate": 0.0009917066522951714,
|
| 17253 |
+
"loss": 8.2202,
|
| 17254 |
+
"step": 2436
|
| 17255 |
+
},
|
| 17256 |
+
{
|
| 17257 |
+
"epoch": 0.1052517923468947,
|
| 17258 |
+
"grad_norm": 0.734375,
|
| 17259 |
+
"learning_rate": 0.0009916936945082854,
|
| 17260 |
+
"loss": 8.0726,
|
| 17261 |
+
"step": 2437
|
| 17262 |
+
},
|
| 17263 |
+
{
|
| 17264 |
+
"epoch": 0.10529498142869483,
|
| 17265 |
+
"grad_norm": 0.42578125,
|
| 17266 |
+
"learning_rate": 0.00099168072669126,
|
| 17267 |
+
"loss": 8.3176,
|
| 17268 |
+
"step": 2438
|
| 17269 |
+
},
|
| 17270 |
+
{
|
| 17271 |
+
"epoch": 0.10533817051049495,
|
| 17272 |
+
"grad_norm": 0.65234375,
|
| 17273 |
+
"learning_rate": 0.00099166774884436,
|
| 17274 |
+
"loss": 8.4613,
|
| 17275 |
+
"step": 2439
|
| 17276 |
+
},
|
| 17277 |
+
{
|
| 17278 |
+
"epoch": 0.10538135959229507,
|
| 17279 |
+
"grad_norm": 1.3046875,
|
| 17280 |
+
"learning_rate": 0.00099165476096785,
|
| 17281 |
+
"loss": 8.4511,
|
| 17282 |
+
"step": 2440
|
| 17283 |
+
},
|
| 17284 |
+
{
|
| 17285 |
+
"epoch": 0.10542454867409519,
|
| 17286 |
+
"grad_norm": 0.58203125,
|
| 17287 |
+
"learning_rate": 0.000991641763061995,
|
| 17288 |
+
"loss": 8.1388,
|
| 17289 |
+
"step": 2441
|
| 17290 |
+
},
|
| 17291 |
+
{
|
| 17292 |
+
"epoch": 0.10546773775589531,
|
| 17293 |
+
"grad_norm": 0.51171875,
|
| 17294 |
+
"learning_rate": 0.0009916287551270599,
|
| 17295 |
+
"loss": 8.254,
|
| 17296 |
+
"step": 2442
|
| 17297 |
+
},
|
| 17298 |
+
{
|
| 17299 |
+
"epoch": 0.10551092683769543,
|
| 17300 |
+
"grad_norm": 0.5625,
|
| 17301 |
+
"learning_rate": 0.0009916157371633106,
|
| 17302 |
+
"loss": 8.3485,
|
| 17303 |
+
"step": 2443
|
| 17304 |
+
},
|
| 17305 |
+
{
|
| 17306 |
+
"epoch": 0.10555411591949555,
|
| 17307 |
+
"grad_norm": 0.474609375,
|
| 17308 |
+
"learning_rate": 0.0009916027091710123,
|
| 17309 |
+
"loss": 8.303,
|
| 17310 |
+
"step": 2444
|
| 17311 |
+
},
|
| 17312 |
+
{
|
| 17313 |
+
"epoch": 0.10559730500129567,
|
| 17314 |
+
"grad_norm": 0.55078125,
|
| 17315 |
+
"learning_rate": 0.0009915896711504306,
|
| 17316 |
+
"loss": 8.7418,
|
| 17317 |
+
"step": 2445
|
| 17318 |
+
},
|
| 17319 |
+
{
|
| 17320 |
+
"epoch": 0.1056404940830958,
|
| 17321 |
+
"grad_norm": 0.578125,
|
| 17322 |
+
"learning_rate": 0.0009915766231018317,
|
| 17323 |
+
"loss": 8.2505,
|
| 17324 |
+
"step": 2446
|
| 17325 |
+
},
|
| 17326 |
+
{
|
| 17327 |
+
"epoch": 0.10568368316489592,
|
| 17328 |
+
"grad_norm": 0.7578125,
|
| 17329 |
+
"learning_rate": 0.000991563565025482,
|
| 17330 |
+
"loss": 8.5745,
|
| 17331 |
+
"step": 2447
|
| 17332 |
+
},
|
| 17333 |
+
{
|
| 17334 |
+
"epoch": 0.10572687224669604,
|
| 17335 |
+
"grad_norm": 0.54296875,
|
| 17336 |
+
"learning_rate": 0.0009915504969216472,
|
| 17337 |
+
"loss": 8.242,
|
| 17338 |
+
"step": 2448
|
| 17339 |
+
},
|
| 17340 |
+
{
|
| 17341 |
+
"epoch": 0.10577006132849616,
|
| 17342 |
+
"grad_norm": 0.58203125,
|
| 17343 |
+
"learning_rate": 0.0009915374187905945,
|
| 17344 |
+
"loss": 8.4203,
|
| 17345 |
+
"step": 2449
|
| 17346 |
+
},
|
| 17347 |
+
{
|
| 17348 |
+
"epoch": 0.10581325041029628,
|
| 17349 |
+
"grad_norm": 0.5390625,
|
| 17350 |
+
"learning_rate": 0.0009915243306325905,
|
| 17351 |
+
"loss": 8.1327,
|
| 17352 |
+
"step": 2450
|
| 17353 |
+
},
|
| 17354 |
+
{
|
| 17355 |
+
"epoch": 0.1058564394920964,
|
| 17356 |
+
"grad_norm": 0.65234375,
|
| 17357 |
+
"learning_rate": 0.0009915112324479021,
|
| 17358 |
+
"loss": 8.4274,
|
| 17359 |
+
"step": 2451
|
| 17360 |
+
},
|
| 17361 |
+
{
|
| 17362 |
+
"epoch": 0.10589962857389652,
|
| 17363 |
+
"grad_norm": 0.62890625,
|
| 17364 |
+
"learning_rate": 0.0009914981242367966,
|
| 17365 |
+
"loss": 8.141,
|
| 17366 |
+
"step": 2452
|
| 17367 |
+
},
|
| 17368 |
+
{
|
| 17369 |
+
"epoch": 0.10594281765569664,
|
| 17370 |
+
"grad_norm": 0.609375,
|
| 17371 |
+
"learning_rate": 0.0009914850059995412,
|
| 17372 |
+
"loss": 8.222,
|
| 17373 |
+
"step": 2453
|
| 17374 |
+
},
|
| 17375 |
+
{
|
| 17376 |
+
"epoch": 0.10598600673749677,
|
| 17377 |
+
"grad_norm": 0.447265625,
|
| 17378 |
+
"learning_rate": 0.0009914718777364038,
|
| 17379 |
+
"loss": 8.2166,
|
| 17380 |
+
"step": 2454
|
| 17381 |
+
},
|
| 17382 |
+
{
|
| 17383 |
+
"epoch": 0.10602919581929689,
|
| 17384 |
+
"grad_norm": 0.412109375,
|
| 17385 |
+
"learning_rate": 0.000991458739447652,
|
| 17386 |
+
"loss": 8.2447,
|
| 17387 |
+
"step": 2455
|
| 17388 |
+
},
|
| 17389 |
+
{
|
| 17390 |
+
"epoch": 0.10607238490109701,
|
| 17391 |
+
"grad_norm": 0.60546875,
|
| 17392 |
+
"learning_rate": 0.0009914455911335537,
|
| 17393 |
+
"loss": 8.471,
|
| 17394 |
+
"step": 2456
|
| 17395 |
+
},
|
| 17396 |
+
{
|
| 17397 |
+
"epoch": 0.10611557398289713,
|
| 17398 |
+
"grad_norm": 0.55078125,
|
| 17399 |
+
"learning_rate": 0.0009914324327943774,
|
| 17400 |
+
"loss": 8.376,
|
| 17401 |
+
"step": 2457
|
| 17402 |
+
},
|
| 17403 |
+
{
|
| 17404 |
+
"epoch": 0.10615876306469725,
|
| 17405 |
+
"grad_norm": 0.78515625,
|
| 17406 |
+
"learning_rate": 0.0009914192644303915,
|
| 17407 |
+
"loss": 8.4581,
|
| 17408 |
+
"step": 2458
|
| 17409 |
+
},
|
| 17410 |
+
{
|
| 17411 |
+
"epoch": 0.10620195214649737,
|
| 17412 |
+
"grad_norm": 0.421875,
|
| 17413 |
+
"learning_rate": 0.0009914060860418644,
|
| 17414 |
+
"loss": 8.6168,
|
| 17415 |
+
"step": 2459
|
| 17416 |
+
},
|
| 17417 |
+
{
|
| 17418 |
+
"epoch": 0.1062451412282975,
|
| 17419 |
+
"grad_norm": 0.51953125,
|
| 17420 |
+
"learning_rate": 0.0009913928976290648,
|
| 17421 |
+
"loss": 8.2619,
|
| 17422 |
+
"step": 2460
|
| 17423 |
+
},
|
| 17424 |
+
{
|
| 17425 |
+
"epoch": 0.1062883303100976,
|
| 17426 |
+
"grad_norm": 0.443359375,
|
| 17427 |
+
"learning_rate": 0.0009913796991922624,
|
| 17428 |
+
"loss": 8.5319,
|
| 17429 |
+
"step": 2461
|
| 17430 |
+
},
|
| 17431 |
+
{
|
| 17432 |
+
"epoch": 0.10633151939189772,
|
| 17433 |
+
"grad_norm": 0.5625,
|
| 17434 |
+
"learning_rate": 0.000991366490731726,
|
| 17435 |
+
"loss": 8.2694,
|
| 17436 |
+
"step": 2462
|
| 17437 |
+
},
|
| 17438 |
+
{
|
| 17439 |
+
"epoch": 0.10637470847369784,
|
| 17440 |
+
"grad_norm": 0.478515625,
|
| 17441 |
+
"learning_rate": 0.0009913532722477247,
|
| 17442 |
+
"loss": 8.5479,
|
| 17443 |
+
"step": 2463
|
| 17444 |
+
},
|
| 17445 |
+
{
|
| 17446 |
+
"epoch": 0.10641789755549796,
|
| 17447 |
+
"grad_norm": 0.4765625,
|
| 17448 |
+
"learning_rate": 0.0009913400437405286,
|
| 17449 |
+
"loss": 8.3825,
|
| 17450 |
+
"step": 2464
|
| 17451 |
+
},
|
| 17452 |
+
{
|
| 17453 |
+
"epoch": 0.10646108663729809,
|
| 17454 |
+
"grad_norm": 0.62890625,
|
| 17455 |
+
"learning_rate": 0.0009913268052104077,
|
| 17456 |
+
"loss": 8.3031,
|
| 17457 |
+
"step": 2465
|
| 17458 |
+
},
|
| 17459 |
+
{
|
| 17460 |
+
"epoch": 0.10650427571909821,
|
| 17461 |
+
"grad_norm": 0.408203125,
|
| 17462 |
+
"learning_rate": 0.0009913135566576314,
|
| 17463 |
+
"loss": 8.3329,
|
| 17464 |
+
"step": 2466
|
| 17465 |
+
},
|
| 17466 |
+
{
|
| 17467 |
+
"epoch": 0.10654746480089833,
|
| 17468 |
+
"grad_norm": 0.431640625,
|
| 17469 |
+
"learning_rate": 0.000991300298082471,
|
| 17470 |
+
"loss": 8.1919,
|
| 17471 |
+
"step": 2467
|
| 17472 |
+
},
|
| 17473 |
+
{
|
| 17474 |
+
"epoch": 0.10659065388269845,
|
| 17475 |
+
"grad_norm": 0.703125,
|
| 17476 |
+
"learning_rate": 0.0009912870294851957,
|
| 17477 |
+
"loss": 8.265,
|
| 17478 |
+
"step": 2468
|
| 17479 |
+
},
|
| 17480 |
+
{
|
| 17481 |
+
"epoch": 0.10663384296449857,
|
| 17482 |
+
"grad_norm": 0.54296875,
|
| 17483 |
+
"learning_rate": 0.000991273750866077,
|
| 17484 |
+
"loss": 8.1909,
|
| 17485 |
+
"step": 2469
|
| 17486 |
+
},
|
| 17487 |
+
{
|
| 17488 |
+
"epoch": 0.10667703204629869,
|
| 17489 |
+
"grad_norm": 0.64453125,
|
| 17490 |
+
"learning_rate": 0.0009912604622253857,
|
| 17491 |
+
"loss": 8.4647,
|
| 17492 |
+
"step": 2470
|
| 17493 |
+
},
|
| 17494 |
+
{
|
| 17495 |
+
"epoch": 0.10672022112809881,
|
| 17496 |
+
"grad_norm": 0.90234375,
|
| 17497 |
+
"learning_rate": 0.0009912471635633924,
|
| 17498 |
+
"loss": 8.006,
|
| 17499 |
+
"step": 2471
|
| 17500 |
+
},
|
| 17501 |
+
{
|
| 17502 |
+
"epoch": 0.10676341020989893,
|
| 17503 |
+
"grad_norm": 0.69921875,
|
| 17504 |
+
"learning_rate": 0.000991233854880369,
|
| 17505 |
+
"loss": 8.237,
|
| 17506 |
+
"step": 2472
|
| 17507 |
+
},
|
| 17508 |
+
{
|
| 17509 |
+
"epoch": 0.10680659929169906,
|
| 17510 |
+
"grad_norm": 0.51953125,
|
| 17511 |
+
"learning_rate": 0.0009912205361765868,
|
| 17512 |
+
"loss": 8.4347,
|
| 17513 |
+
"step": 2473
|
| 17514 |
+
},
|
| 17515 |
+
{
|
| 17516 |
+
"epoch": 0.10684978837349918,
|
| 17517 |
+
"grad_norm": 0.466796875,
|
| 17518 |
+
"learning_rate": 0.0009912072074523173,
|
| 17519 |
+
"loss": 8.6241,
|
| 17520 |
+
"step": 2474
|
| 17521 |
+
},
|
| 17522 |
+
{
|
| 17523 |
+
"epoch": 0.1068929774552993,
|
| 17524 |
+
"grad_norm": 0.470703125,
|
| 17525 |
+
"learning_rate": 0.0009911938687078323,
|
| 17526 |
+
"loss": 8.4256,
|
| 17527 |
+
"step": 2475
|
| 17528 |
+
},
|
| 17529 |
+
{
|
| 17530 |
+
"epoch": 0.10693616653709942,
|
| 17531 |
+
"grad_norm": 0.79296875,
|
| 17532 |
+
"learning_rate": 0.0009911805199434044,
|
| 17533 |
+
"loss": 8.4895,
|
| 17534 |
+
"step": 2476
|
| 17535 |
+
},
|
| 17536 |
+
{
|
| 17537 |
+
"epoch": 0.10697935561889954,
|
| 17538 |
+
"grad_norm": 0.59375,
|
| 17539 |
+
"learning_rate": 0.0009911671611593056,
|
| 17540 |
+
"loss": 8.3152,
|
| 17541 |
+
"step": 2477
|
| 17542 |
+
},
|
| 17543 |
+
{
|
| 17544 |
+
"epoch": 0.10702254470069966,
|
| 17545 |
+
"grad_norm": 0.5234375,
|
| 17546 |
+
"learning_rate": 0.0009911537923558082,
|
| 17547 |
+
"loss": 8.2683,
|
| 17548 |
+
"step": 2478
|
| 17549 |
+
},
|
| 17550 |
+
{
|
| 17551 |
+
"epoch": 0.10706573378249978,
|
| 17552 |
+
"grad_norm": 0.53515625,
|
| 17553 |
+
"learning_rate": 0.0009911404135331853,
|
| 17554 |
+
"loss": 8.4738,
|
| 17555 |
+
"step": 2479
|
| 17556 |
+
},
|
| 17557 |
+
{
|
| 17558 |
+
"epoch": 0.1071089228642999,
|
| 17559 |
+
"grad_norm": 0.462890625,
|
| 17560 |
+
"learning_rate": 0.0009911270246917095,
|
| 17561 |
+
"loss": 8.2726,
|
| 17562 |
+
"step": 2480
|
| 17563 |
+
},
|
| 17564 |
+
{
|
| 17565 |
+
"epoch": 0.10715211194610003,
|
| 17566 |
+
"grad_norm": 0.447265625,
|
| 17567 |
+
"learning_rate": 0.000991113625831654,
|
| 17568 |
+
"loss": 8.3151,
|
| 17569 |
+
"step": 2481
|
| 17570 |
+
},
|
| 17571 |
+
{
|
| 17572 |
+
"epoch": 0.10719530102790015,
|
| 17573 |
+
"grad_norm": 0.451171875,
|
| 17574 |
+
"learning_rate": 0.0009911002169532923,
|
| 17575 |
+
"loss": 8.4751,
|
| 17576 |
+
"step": 2482
|
| 17577 |
+
},
|
| 17578 |
+
{
|
| 17579 |
+
"epoch": 0.10723849010970027,
|
| 17580 |
+
"grad_norm": 0.578125,
|
| 17581 |
+
"learning_rate": 0.0009910867980568979,
|
| 17582 |
+
"loss": 8.5141,
|
| 17583 |
+
"step": 2483
|
| 17584 |
+
},
|
| 17585 |
+
{
|
| 17586 |
+
"epoch": 0.10728167919150039,
|
| 17587 |
+
"grad_norm": 0.61328125,
|
| 17588 |
+
"learning_rate": 0.0009910733691427442,
|
| 17589 |
+
"loss": 8.4462,
|
| 17590 |
+
"step": 2484
|
| 17591 |
+
},
|
| 17592 |
+
{
|
| 17593 |
+
"epoch": 0.10732486827330051,
|
| 17594 |
+
"grad_norm": 0.5546875,
|
| 17595 |
+
"learning_rate": 0.0009910599302111057,
|
| 17596 |
+
"loss": 8.232,
|
| 17597 |
+
"step": 2485
|
| 17598 |
+
},
|
| 17599 |
+
{
|
| 17600 |
+
"epoch": 0.10736805735510063,
|
| 17601 |
+
"grad_norm": 0.58203125,
|
| 17602 |
+
"learning_rate": 0.000991046481262256,
|
| 17603 |
+
"loss": 8.1304,
|
| 17604 |
+
"step": 2486
|
| 17605 |
+
},
|
| 17606 |
+
{
|
| 17607 |
+
"epoch": 0.10741124643690075,
|
| 17608 |
+
"grad_norm": 0.44140625,
|
| 17609 |
+
"learning_rate": 0.00099103302229647,
|
| 17610 |
+
"loss": 8.3622,
|
| 17611 |
+
"step": 2487
|
| 17612 |
+
},
|
| 17613 |
+
{
|
| 17614 |
+
"epoch": 0.10745443551870087,
|
| 17615 |
+
"grad_norm": 0.51171875,
|
| 17616 |
+
"learning_rate": 0.0009910195533140214,
|
| 17617 |
+
"loss": 8.3612,
|
| 17618 |
+
"step": 2488
|
| 17619 |
+
},
|
| 17620 |
+
{
|
| 17621 |
+
"epoch": 0.107497624600501,
|
| 17622 |
+
"grad_norm": 0.484375,
|
| 17623 |
+
"learning_rate": 0.000991006074315186,
|
| 17624 |
+
"loss": 8.3691,
|
| 17625 |
+
"step": 2489
|
| 17626 |
+
},
|
| 17627 |
+
{
|
| 17628 |
+
"epoch": 0.10754081368230112,
|
| 17629 |
+
"grad_norm": 0.55859375,
|
| 17630 |
+
"learning_rate": 0.000990992585300238,
|
| 17631 |
+
"loss": 8.752,
|
| 17632 |
+
"step": 2490
|
| 17633 |
+
},
|
| 17634 |
+
{
|
| 17635 |
+
"epoch": 0.10758400276410124,
|
| 17636 |
+
"grad_norm": 0.54296875,
|
| 17637 |
+
"learning_rate": 0.0009909790862694529,
|
| 17638 |
+
"loss": 8.3788,
|
| 17639 |
+
"step": 2491
|
| 17640 |
+
},
|
| 17641 |
+
{
|
| 17642 |
+
"epoch": 0.10762719184590136,
|
| 17643 |
+
"grad_norm": 0.423828125,
|
| 17644 |
+
"learning_rate": 0.0009909655772231056,
|
| 17645 |
+
"loss": 8.378,
|
| 17646 |
+
"step": 2492
|
| 17647 |
+
},
|
| 17648 |
+
{
|
| 17649 |
+
"epoch": 0.10767038092770148,
|
| 17650 |
+
"grad_norm": 0.58203125,
|
| 17651 |
+
"learning_rate": 0.0009909520581614724,
|
| 17652 |
+
"loss": 8.2585,
|
| 17653 |
+
"step": 2493
|
| 17654 |
+
},
|
| 17655 |
+
{
|
| 17656 |
+
"epoch": 0.1077135700095016,
|
| 17657 |
+
"grad_norm": 0.453125,
|
| 17658 |
+
"learning_rate": 0.0009909385290848287,
|
| 17659 |
+
"loss": 8.3038,
|
| 17660 |
+
"step": 2494
|
| 17661 |
+
},
|
| 17662 |
+
{
|
| 17663 |
+
"epoch": 0.10775675909130172,
|
| 17664 |
+
"grad_norm": 0.466796875,
|
| 17665 |
+
"learning_rate": 0.0009909249899934505,
|
| 17666 |
+
"loss": 8.5207,
|
| 17667 |
+
"step": 2495
|
| 17668 |
+
},
|
| 17669 |
+
{
|
| 17670 |
+
"epoch": 0.10779994817310184,
|
| 17671 |
+
"grad_norm": 0.6796875,
|
| 17672 |
+
"learning_rate": 0.000990911440887614,
|
| 17673 |
+
"loss": 7.9057,
|
| 17674 |
+
"step": 2496
|
| 17675 |
+
},
|
| 17676 |
+
{
|
| 17677 |
+
"epoch": 0.10784313725490197,
|
| 17678 |
+
"grad_norm": 0.74609375,
|
| 17679 |
+
"learning_rate": 0.0009908978817675957,
|
| 17680 |
+
"loss": 8.4006,
|
| 17681 |
+
"step": 2497
|
| 17682 |
+
},
|
| 17683 |
+
{
|
| 17684 |
+
"epoch": 0.10788632633670209,
|
| 17685 |
+
"grad_norm": 0.455078125,
|
| 17686 |
+
"learning_rate": 0.000990884312633672,
|
| 17687 |
+
"loss": 8.3619,
|
| 17688 |
+
"step": 2498
|
| 17689 |
+
},
|
| 17690 |
+
{
|
| 17691 |
+
"epoch": 0.10792951541850221,
|
| 17692 |
+
"grad_norm": 0.53515625,
|
| 17693 |
+
"learning_rate": 0.0009908707334861197,
|
| 17694 |
+
"loss": 8.5389,
|
| 17695 |
+
"step": 2499
|
| 17696 |
+
},
|
| 17697 |
+
{
|
| 17698 |
+
"epoch": 0.10797270450030233,
|
| 17699 |
+
"grad_norm": 0.65625,
|
| 17700 |
+
"learning_rate": 0.000990857144325216,
|
| 17701 |
+
"loss": 8.3051,
|
| 17702 |
+
"step": 2500
|
| 17703 |
+
},
|
| 17704 |
+
{
|
| 17705 |
+
"epoch": 0.10797270450030233,
|
| 17706 |
+
"eval_loss": 8.355310440063477,
|
| 17707 |
+
"eval_runtime": 21.5458,
|
| 17708 |
+
"eval_samples_per_second": 1.114,
|
| 17709 |
+
"eval_steps_per_second": 0.139,
|
| 17710 |
+
"step": 2500
|
| 17711 |
}
|
| 17712 |
],
|
| 17713 |
"logging_steps": 1,
|
|
|
|
| 17727 |
"attributes": {}
|
| 17728 |
}
|
| 17729 |
},
|
| 17730 |
+
"total_flos": 7991377920000.0,
|
| 17731 |
"train_batch_size": 1,
|
| 17732 |
"trial_name": null,
|
| 17733 |
"trial_params": null
|