Training in progress, step 18000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +353 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 304481530
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13fbe4723123a9c016392f22f5c5a607f137024e3a3211fa73da181d0f6cd1aa
|
| 3 |
size 304481530
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 402029570
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3aa6efd41ace1816d77bf0b60c121855a1169e94c3066ee2c4a8939be056cb68
|
| 3 |
size 402029570
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98e45d3c16114f00517a9e754366d6be11045def442e0374684988d3ee13c529
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:773184c6d03f9fc1dff724dd2ebc3487575db231883b47dc4663fdc68f33bddb
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9a97caacfd2ffecaa53d612d1aaec198c719ff4db983e8469e19a70730a6af9
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee307f509a475bceeb88f57a12c9dbe31c5cc43a16b915e7c00fca8b909b56f5
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:add33ce1c647f1ad24436fdd2c7095ade5081fad618777000690c7e187278b49
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -5958,6 +5958,356 @@
|
|
| 5958 |
"learning_rate": 0.000494636149601328,
|
| 5959 |
"loss": 20.0712,
|
| 5960 |
"step": 17000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5961 |
}
|
| 5962 |
],
|
| 5963 |
"logging_steps": 20,
|
|
@@ -5977,7 +6327,7 @@
|
|
| 5977 |
"attributes": {}
|
| 5978 |
}
|
| 5979 |
},
|
| 5980 |
-
"total_flos": 1.
|
| 5981 |
"train_batch_size": 48,
|
| 5982 |
"trial_name": null,
|
| 5983 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.035099521769015894,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 18000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 5958 |
"learning_rate": 0.000494636149601328,
|
| 5959 |
"loss": 20.0712,
|
| 5960 |
"step": 17000
|
| 5961 |
+
},
|
| 5962 |
+
{
|
| 5963 |
+
"epoch": 0.03318854780603614,
|
| 5964 |
+
"grad_norm": 11.0,
|
| 5965 |
+
"learning_rate": 0.0004946296475704186,
|
| 5966 |
+
"loss": 20.1068,
|
| 5967 |
+
"step": 17020
|
| 5968 |
+
},
|
| 5969 |
+
{
|
| 5970 |
+
"epoch": 0.03322754727466838,
|
| 5971 |
+
"grad_norm": 10.1875,
|
| 5972 |
+
"learning_rate": 0.0004946231455395093,
|
| 5973 |
+
"loss": 20.0552,
|
| 5974 |
+
"step": 17040
|
| 5975 |
+
},
|
| 5976 |
+
{
|
| 5977 |
+
"epoch": 0.03326654674330062,
|
| 5978 |
+
"grad_norm": 12.5,
|
| 5979 |
+
"learning_rate": 0.0004946166435085999,
|
| 5980 |
+
"loss": 20.0382,
|
| 5981 |
+
"step": 17060
|
| 5982 |
+
},
|
| 5983 |
+
{
|
| 5984 |
+
"epoch": 0.033305546211932865,
|
| 5985 |
+
"grad_norm": 11.125,
|
| 5986 |
+
"learning_rate": 0.0004946101414776906,
|
| 5987 |
+
"loss": 20.1285,
|
| 5988 |
+
"step": 17080
|
| 5989 |
+
},
|
| 5990 |
+
{
|
| 5991 |
+
"epoch": 0.0333445456805651,
|
| 5992 |
+
"grad_norm": 10.375,
|
| 5993 |
+
"learning_rate": 0.0004946036394467813,
|
| 5994 |
+
"loss": 20.0373,
|
| 5995 |
+
"step": 17100
|
| 5996 |
+
},
|
| 5997 |
+
{
|
| 5998 |
+
"epoch": 0.03338354514919734,
|
| 5999 |
+
"grad_norm": 9.9375,
|
| 6000 |
+
"learning_rate": 0.0004945971374158719,
|
| 6001 |
+
"loss": 20.1946,
|
| 6002 |
+
"step": 17120
|
| 6003 |
+
},
|
| 6004 |
+
{
|
| 6005 |
+
"epoch": 0.033422544617829585,
|
| 6006 |
+
"grad_norm": 10.6875,
|
| 6007 |
+
"learning_rate": 0.0004945906353849625,
|
| 6008 |
+
"loss": 20.1412,
|
| 6009 |
+
"step": 17140
|
| 6010 |
+
},
|
| 6011 |
+
{
|
| 6012 |
+
"epoch": 0.03346154408646182,
|
| 6013 |
+
"grad_norm": 9.5,
|
| 6014 |
+
"learning_rate": 0.0004945841333540531,
|
| 6015 |
+
"loss": 20.078,
|
| 6016 |
+
"step": 17160
|
| 6017 |
+
},
|
| 6018 |
+
{
|
| 6019 |
+
"epoch": 0.03350054355509406,
|
| 6020 |
+
"grad_norm": 10.5,
|
| 6021 |
+
"learning_rate": 0.0004945776313231438,
|
| 6022 |
+
"loss": 20.0913,
|
| 6023 |
+
"step": 17180
|
| 6024 |
+
},
|
| 6025 |
+
{
|
| 6026 |
+
"epoch": 0.033539543023726304,
|
| 6027 |
+
"grad_norm": 11.75,
|
| 6028 |
+
"learning_rate": 0.0004945711292922344,
|
| 6029 |
+
"loss": 20.1428,
|
| 6030 |
+
"step": 17200
|
| 6031 |
+
},
|
| 6032 |
+
{
|
| 6033 |
+
"epoch": 0.03357854249235854,
|
| 6034 |
+
"grad_norm": 10.8125,
|
| 6035 |
+
"learning_rate": 0.0004945646272613251,
|
| 6036 |
+
"loss": 20.0407,
|
| 6037 |
+
"step": 17220
|
| 6038 |
+
},
|
| 6039 |
+
{
|
| 6040 |
+
"epoch": 0.03361754196099078,
|
| 6041 |
+
"grad_norm": 10.5625,
|
| 6042 |
+
"learning_rate": 0.0004945581252304157,
|
| 6043 |
+
"loss": 20.1396,
|
| 6044 |
+
"step": 17240
|
| 6045 |
+
},
|
| 6046 |
+
{
|
| 6047 |
+
"epoch": 0.03365654142962302,
|
| 6048 |
+
"grad_norm": 10.0,
|
| 6049 |
+
"learning_rate": 0.0004945516231995064,
|
| 6050 |
+
"loss": 20.0334,
|
| 6051 |
+
"step": 17260
|
| 6052 |
+
},
|
| 6053 |
+
{
|
| 6054 |
+
"epoch": 0.03369554089825526,
|
| 6055 |
+
"grad_norm": 9.875,
|
| 6056 |
+
"learning_rate": 0.0004945451211685971,
|
| 6057 |
+
"loss": 19.9909,
|
| 6058 |
+
"step": 17280
|
| 6059 |
+
},
|
| 6060 |
+
{
|
| 6061 |
+
"epoch": 0.0337345403668875,
|
| 6062 |
+
"grad_norm": 12.4375,
|
| 6063 |
+
"learning_rate": 0.0004945386191376876,
|
| 6064 |
+
"loss": 20.0374,
|
| 6065 |
+
"step": 17300
|
| 6066 |
+
},
|
| 6067 |
+
{
|
| 6068 |
+
"epoch": 0.03377353983551974,
|
| 6069 |
+
"grad_norm": 9.75,
|
| 6070 |
+
"learning_rate": 0.0004945321171067783,
|
| 6071 |
+
"loss": 20.0703,
|
| 6072 |
+
"step": 17320
|
| 6073 |
+
},
|
| 6074 |
+
{
|
| 6075 |
+
"epoch": 0.033812539304151984,
|
| 6076 |
+
"grad_norm": 11.375,
|
| 6077 |
+
"learning_rate": 0.0004945256150758689,
|
| 6078 |
+
"loss": 19.9489,
|
| 6079 |
+
"step": 17340
|
| 6080 |
+
},
|
| 6081 |
+
{
|
| 6082 |
+
"epoch": 0.03385153877278422,
|
| 6083 |
+
"grad_norm": 11.3125,
|
| 6084 |
+
"learning_rate": 0.0004945191130449596,
|
| 6085 |
+
"loss": 19.9904,
|
| 6086 |
+
"step": 17360
|
| 6087 |
+
},
|
| 6088 |
+
{
|
| 6089 |
+
"epoch": 0.03389053824141646,
|
| 6090 |
+
"grad_norm": 10.3125,
|
| 6091 |
+
"learning_rate": 0.0004945126110140502,
|
| 6092 |
+
"loss": 19.9895,
|
| 6093 |
+
"step": 17380
|
| 6094 |
+
},
|
| 6095 |
+
{
|
| 6096 |
+
"epoch": 0.0339295377100487,
|
| 6097 |
+
"grad_norm": 11.0,
|
| 6098 |
+
"learning_rate": 0.0004945061089831409,
|
| 6099 |
+
"loss": 20.0525,
|
| 6100 |
+
"step": 17400
|
| 6101 |
+
},
|
| 6102 |
+
{
|
| 6103 |
+
"epoch": 0.03396853717868094,
|
| 6104 |
+
"grad_norm": 10.0,
|
| 6105 |
+
"learning_rate": 0.0004944996069522316,
|
| 6106 |
+
"loss": 20.0451,
|
| 6107 |
+
"step": 17420
|
| 6108 |
+
},
|
| 6109 |
+
{
|
| 6110 |
+
"epoch": 0.03400753664731318,
|
| 6111 |
+
"grad_norm": 10.625,
|
| 6112 |
+
"learning_rate": 0.0004944931049213222,
|
| 6113 |
+
"loss": 20.0506,
|
| 6114 |
+
"step": 17440
|
| 6115 |
+
},
|
| 6116 |
+
{
|
| 6117 |
+
"epoch": 0.03404653611594542,
|
| 6118 |
+
"grad_norm": 10.4375,
|
| 6119 |
+
"learning_rate": 0.0004944866028904128,
|
| 6120 |
+
"loss": 19.9625,
|
| 6121 |
+
"step": 17460
|
| 6122 |
+
},
|
| 6123 |
+
{
|
| 6124 |
+
"epoch": 0.03408553558457766,
|
| 6125 |
+
"grad_norm": 11.625,
|
| 6126 |
+
"learning_rate": 0.0004944801008595034,
|
| 6127 |
+
"loss": 19.995,
|
| 6128 |
+
"step": 17480
|
| 6129 |
+
},
|
| 6130 |
+
{
|
| 6131 |
+
"epoch": 0.0341245350532099,
|
| 6132 |
+
"grad_norm": 11.0625,
|
| 6133 |
+
"learning_rate": 0.0004944735988285941,
|
| 6134 |
+
"loss": 20.1062,
|
| 6135 |
+
"step": 17500
|
| 6136 |
+
},
|
| 6137 |
+
{
|
| 6138 |
+
"epoch": 0.03416353452184214,
|
| 6139 |
+
"grad_norm": 10.4375,
|
| 6140 |
+
"learning_rate": 0.0004944670967976847,
|
| 6141 |
+
"loss": 19.9454,
|
| 6142 |
+
"step": 17520
|
| 6143 |
+
},
|
| 6144 |
+
{
|
| 6145 |
+
"epoch": 0.034202533990474376,
|
| 6146 |
+
"grad_norm": 9.3125,
|
| 6147 |
+
"learning_rate": 0.0004944605947667754,
|
| 6148 |
+
"loss": 19.8752,
|
| 6149 |
+
"step": 17540
|
| 6150 |
+
},
|
| 6151 |
+
{
|
| 6152 |
+
"epoch": 0.03424153345910662,
|
| 6153 |
+
"grad_norm": 10.375,
|
| 6154 |
+
"learning_rate": 0.000494454092735866,
|
| 6155 |
+
"loss": 19.9649,
|
| 6156 |
+
"step": 17560
|
| 6157 |
+
},
|
| 6158 |
+
{
|
| 6159 |
+
"epoch": 0.03428053292773886,
|
| 6160 |
+
"grad_norm": 10.125,
|
| 6161 |
+
"learning_rate": 0.0004944475907049567,
|
| 6162 |
+
"loss": 19.9261,
|
| 6163 |
+
"step": 17580
|
| 6164 |
+
},
|
| 6165 |
+
{
|
| 6166 |
+
"epoch": 0.0343195323963711,
|
| 6167 |
+
"grad_norm": 9.875,
|
| 6168 |
+
"learning_rate": 0.0004944410886740474,
|
| 6169 |
+
"loss": 19.909,
|
| 6170 |
+
"step": 17600
|
| 6171 |
+
},
|
| 6172 |
+
{
|
| 6173 |
+
"epoch": 0.03435853186500334,
|
| 6174 |
+
"grad_norm": 10.75,
|
| 6175 |
+
"learning_rate": 0.000494434586643138,
|
| 6176 |
+
"loss": 19.9778,
|
| 6177 |
+
"step": 17620
|
| 6178 |
+
},
|
| 6179 |
+
{
|
| 6180 |
+
"epoch": 0.03439753133363558,
|
| 6181 |
+
"grad_norm": 11.5,
|
| 6182 |
+
"learning_rate": 0.0004944280846122287,
|
| 6183 |
+
"loss": 19.9709,
|
| 6184 |
+
"step": 17640
|
| 6185 |
+
},
|
| 6186 |
+
{
|
| 6187 |
+
"epoch": 0.03443653080226782,
|
| 6188 |
+
"grad_norm": 11.125,
|
| 6189 |
+
"learning_rate": 0.0004944215825813193,
|
| 6190 |
+
"loss": 19.9898,
|
| 6191 |
+
"step": 17660
|
| 6192 |
+
},
|
| 6193 |
+
{
|
| 6194 |
+
"epoch": 0.03447553027090006,
|
| 6195 |
+
"grad_norm": 10.5625,
|
| 6196 |
+
"learning_rate": 0.00049441508055041,
|
| 6197 |
+
"loss": 19.9979,
|
| 6198 |
+
"step": 17680
|
| 6199 |
+
},
|
| 6200 |
+
{
|
| 6201 |
+
"epoch": 0.0345145297395323,
|
| 6202 |
+
"grad_norm": 9.5625,
|
| 6203 |
+
"learning_rate": 0.0004944085785195005,
|
| 6204 |
+
"loss": 19.9206,
|
| 6205 |
+
"step": 17700
|
| 6206 |
+
},
|
| 6207 |
+
{
|
| 6208 |
+
"epoch": 0.03455352920816454,
|
| 6209 |
+
"grad_norm": 11.0,
|
| 6210 |
+
"learning_rate": 0.0004944020764885912,
|
| 6211 |
+
"loss": 19.9701,
|
| 6212 |
+
"step": 17720
|
| 6213 |
+
},
|
| 6214 |
+
{
|
| 6215 |
+
"epoch": 0.034592528676796776,
|
| 6216 |
+
"grad_norm": 10.75,
|
| 6217 |
+
"learning_rate": 0.0004943955744576818,
|
| 6218 |
+
"loss": 19.9937,
|
| 6219 |
+
"step": 17740
|
| 6220 |
+
},
|
| 6221 |
+
{
|
| 6222 |
+
"epoch": 0.03463152814542902,
|
| 6223 |
+
"grad_norm": 12.5625,
|
| 6224 |
+
"learning_rate": 0.0004943890724267725,
|
| 6225 |
+
"loss": 20.0349,
|
| 6226 |
+
"step": 17760
|
| 6227 |
+
},
|
| 6228 |
+
{
|
| 6229 |
+
"epoch": 0.03467052761406126,
|
| 6230 |
+
"grad_norm": 11.3125,
|
| 6231 |
+
"learning_rate": 0.0004943825703958632,
|
| 6232 |
+
"loss": 19.8582,
|
| 6233 |
+
"step": 17780
|
| 6234 |
+
},
|
| 6235 |
+
{
|
| 6236 |
+
"epoch": 0.034709527082693495,
|
| 6237 |
+
"grad_norm": 12.125,
|
| 6238 |
+
"learning_rate": 0.0004943760683649538,
|
| 6239 |
+
"loss": 19.9185,
|
| 6240 |
+
"step": 17800
|
| 6241 |
+
},
|
| 6242 |
+
{
|
| 6243 |
+
"epoch": 0.03474852655132574,
|
| 6244 |
+
"grad_norm": 10.625,
|
| 6245 |
+
"learning_rate": 0.0004943695663340445,
|
| 6246 |
+
"loss": 19.9073,
|
| 6247 |
+
"step": 17820
|
| 6248 |
+
},
|
| 6249 |
+
{
|
| 6250 |
+
"epoch": 0.03478752601995798,
|
| 6251 |
+
"grad_norm": 9.8125,
|
| 6252 |
+
"learning_rate": 0.0004943630643031351,
|
| 6253 |
+
"loss": 19.8189,
|
| 6254 |
+
"step": 17840
|
| 6255 |
+
},
|
| 6256 |
+
{
|
| 6257 |
+
"epoch": 0.03482652548859022,
|
| 6258 |
+
"grad_norm": 12.375,
|
| 6259 |
+
"learning_rate": 0.0004943565622722258,
|
| 6260 |
+
"loss": 20.0152,
|
| 6261 |
+
"step": 17860
|
| 6262 |
+
},
|
| 6263 |
+
{
|
| 6264 |
+
"epoch": 0.034865524957222456,
|
| 6265 |
+
"grad_norm": 10.0,
|
| 6266 |
+
"learning_rate": 0.0004943500602413164,
|
| 6267 |
+
"loss": 19.9768,
|
| 6268 |
+
"step": 17880
|
| 6269 |
+
},
|
| 6270 |
+
{
|
| 6271 |
+
"epoch": 0.0349045244258547,
|
| 6272 |
+
"grad_norm": 10.5,
|
| 6273 |
+
"learning_rate": 0.0004943435582104071,
|
| 6274 |
+
"loss": 19.9124,
|
| 6275 |
+
"step": 17900
|
| 6276 |
+
},
|
| 6277 |
+
{
|
| 6278 |
+
"epoch": 0.03494352389448694,
|
| 6279 |
+
"grad_norm": 9.5,
|
| 6280 |
+
"learning_rate": 0.0004943370561794977,
|
| 6281 |
+
"loss": 19.8925,
|
| 6282 |
+
"step": 17920
|
| 6283 |
+
},
|
| 6284 |
+
{
|
| 6285 |
+
"epoch": 0.034982523363119175,
|
| 6286 |
+
"grad_norm": 8.75,
|
| 6287 |
+
"learning_rate": 0.0004943305541485883,
|
| 6288 |
+
"loss": 19.9456,
|
| 6289 |
+
"step": 17940
|
| 6290 |
+
},
|
| 6291 |
+
{
|
| 6292 |
+
"epoch": 0.03502152283175142,
|
| 6293 |
+
"grad_norm": 10.625,
|
| 6294 |
+
"learning_rate": 0.000494324052117679,
|
| 6295 |
+
"loss": 19.8603,
|
| 6296 |
+
"step": 17960
|
| 6297 |
+
},
|
| 6298 |
+
{
|
| 6299 |
+
"epoch": 0.03506052230038366,
|
| 6300 |
+
"grad_norm": 10.25,
|
| 6301 |
+
"learning_rate": 0.0004943175500867696,
|
| 6302 |
+
"loss": 19.909,
|
| 6303 |
+
"step": 17980
|
| 6304 |
+
},
|
| 6305 |
+
{
|
| 6306 |
+
"epoch": 0.035099521769015894,
|
| 6307 |
+
"grad_norm": 9.5625,
|
| 6308 |
+
"learning_rate": 0.0004943110480558603,
|
| 6309 |
+
"loss": 19.8528,
|
| 6310 |
+
"step": 18000
|
| 6311 |
}
|
| 6312 |
],
|
| 6313 |
"logging_steps": 20,
|
|
|
|
| 6327 |
"attributes": {}
|
| 6328 |
}
|
| 6329 |
},
|
| 6330 |
+
"total_flos": 1.3232995623550058e+19,
|
| 6331 |
"train_batch_size": 48,
|
| 6332 |
"trial_name": null,
|
| 6333 |
"trial_params": null
|