Training in progress, step 3300, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 3237829088
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:365157e0cc91c6ea82754070aed20459af7616cdd87d96a38b0933e4ebe719a6
|
| 3 |
size 3237829088
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2062272049
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bca2f83c7c62bb2baeb05f97ac5a95135b02a1d4757160680bb94bbe4a6a7b0a
|
| 3 |
size 2062272049
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e2f931a6aed50b06e410ca372eb5f503052ee3f5905b5b560a45a62d502dc2ff
|
| 3 |
size 14645
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92459d776349d1cc2d4327d5ed9e474de76e06b8e6491efc16a39d8110d2a844
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d134eca5097bea9b8988d832ccb2da62b2a551181674ab666a19da6cf129c3d
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 300,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2108,6 +2108,216 @@
|
|
| 2108 |
"learning_rate": 9.943054290774756e-06,
|
| 2109 |
"loss": 0.7574,
|
| 2110 |
"step": 3000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2111 |
}
|
| 2112 |
],
|
| 2113 |
"logging_steps": 10,
|
|
@@ -2127,7 +2337,7 @@
|
|
| 2127 |
"attributes": {}
|
| 2128 |
}
|
| 2129 |
},
|
| 2130 |
-
"total_flos": 1.
|
| 2131 |
"train_batch_size": 6,
|
| 2132 |
"trial_name": null,
|
| 2133 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.9470512268618166,
|
| 6 |
"eval_steps": 300,
|
| 7 |
+
"global_step": 3300,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2108 |
"learning_rate": 9.943054290774756e-06,
|
| 2109 |
"loss": 0.7574,
|
| 2110 |
"step": 3000
|
| 2111 |
+
},
|
| 2112 |
+
{
|
| 2113 |
+
"epoch": 0.8638255129860812,
|
| 2114 |
+
"grad_norm": 5.900289535522461,
|
| 2115 |
+
"learning_rate": 9.547076661622922e-06,
|
| 2116 |
+
"loss": 0.7758,
|
| 2117 |
+
"step": 3010
|
| 2118 |
+
},
|
| 2119 |
+
{
|
| 2120 |
+
"epoch": 0.8666953651886928,
|
| 2121 |
+
"grad_norm": 5.241759777069092,
|
| 2122 |
+
"learning_rate": 9.15874942234024e-06,
|
| 2123 |
+
"loss": 0.7805,
|
| 2124 |
+
"step": 3020
|
| 2125 |
+
},
|
| 2126 |
+
{
|
| 2127 |
+
"epoch": 0.8695652173913043,
|
| 2128 |
+
"grad_norm": 4.609664440155029,
|
| 2129 |
+
"learning_rate": 8.778105417136395e-06,
|
| 2130 |
+
"loss": 0.7642,
|
| 2131 |
+
"step": 3030
|
| 2132 |
+
},
|
| 2133 |
+
{
|
| 2134 |
+
"epoch": 0.8724350695939159,
|
| 2135 |
+
"grad_norm": 6.470444202423096,
|
| 2136 |
+
"learning_rate": 8.405176840383122e-06,
|
| 2137 |
+
"loss": 0.7928,
|
| 2138 |
+
"step": 3040
|
| 2139 |
+
},
|
| 2140 |
+
{
|
| 2141 |
+
"epoch": 0.8753049217965275,
|
| 2142 |
+
"grad_norm": 3.531794786453247,
|
| 2143 |
+
"learning_rate": 8.039995233891362e-06,
|
| 2144 |
+
"loss": 0.7503,
|
| 2145 |
+
"step": 3050
|
| 2146 |
+
},
|
| 2147 |
+
{
|
| 2148 |
+
"epoch": 0.878174773999139,
|
| 2149 |
+
"grad_norm": 5.537559986114502,
|
| 2150 |
+
"learning_rate": 7.682591484243417e-06,
|
| 2151 |
+
"loss": 0.7343,
|
| 2152 |
+
"step": 3060
|
| 2153 |
+
},
|
| 2154 |
+
{
|
| 2155 |
+
"epoch": 0.8810446262017506,
|
| 2156 |
+
"grad_norm": 3.7967238426208496,
|
| 2157 |
+
"learning_rate": 7.332995820180677e-06,
|
| 2158 |
+
"loss": 0.7345,
|
| 2159 |
+
"step": 3070
|
| 2160 |
+
},
|
| 2161 |
+
{
|
| 2162 |
+
"epoch": 0.8839144784043622,
|
| 2163 |
+
"grad_norm": 4.1268839836120605,
|
| 2164 |
+
"learning_rate": 6.991237810046847e-06,
|
| 2165 |
+
"loss": 0.7557,
|
| 2166 |
+
"step": 3080
|
| 2167 |
+
},
|
| 2168 |
+
{
|
| 2169 |
+
"epoch": 0.8867843306069737,
|
| 2170 |
+
"grad_norm": 7.182312965393066,
|
| 2171 |
+
"learning_rate": 6.6573463592871085e-06,
|
| 2172 |
+
"loss": 0.7635,
|
| 2173 |
+
"step": 3090
|
| 2174 |
+
},
|
| 2175 |
+
{
|
| 2176 |
+
"epoch": 0.8896541828095853,
|
| 2177 |
+
"grad_norm": 3.4768388271331787,
|
| 2178 |
+
"learning_rate": 6.331349708003365e-06,
|
| 2179 |
+
"loss": 0.7325,
|
| 2180 |
+
"step": 3100
|
| 2181 |
+
},
|
| 2182 |
+
{
|
| 2183 |
+
"epoch": 0.8925240350121969,
|
| 2184 |
+
"grad_norm": 5.252262115478516,
|
| 2185 |
+
"learning_rate": 6.013275428565712e-06,
|
| 2186 |
+
"loss": 0.7513,
|
| 2187 |
+
"step": 3110
|
| 2188 |
+
},
|
| 2189 |
+
{
|
| 2190 |
+
"epoch": 0.8953938872148084,
|
| 2191 |
+
"grad_norm": 4.213047027587891,
|
| 2192 |
+
"learning_rate": 5.703150423280401e-06,
|
| 2193 |
+
"loss": 0.7685,
|
| 2194 |
+
"step": 3120
|
| 2195 |
+
},
|
| 2196 |
+
{
|
| 2197 |
+
"epoch": 0.89826373941742,
|
| 2198 |
+
"grad_norm": 4.207084655761719,
|
| 2199 |
+
"learning_rate": 5.401000922114485e-06,
|
| 2200 |
+
"loss": 0.7313,
|
| 2201 |
+
"step": 3130
|
| 2202 |
+
},
|
| 2203 |
+
{
|
| 2204 |
+
"epoch": 0.9011335916200316,
|
| 2205 |
+
"grad_norm": 6.862100124359131,
|
| 2206 |
+
"learning_rate": 5.10685248047732e-06,
|
| 2207 |
+
"loss": 0.7626,
|
| 2208 |
+
"step": 3140
|
| 2209 |
+
},
|
| 2210 |
+
{
|
| 2211 |
+
"epoch": 0.9040034438226431,
|
| 2212 |
+
"grad_norm": 3.541048049926758,
|
| 2213 |
+
"learning_rate": 4.82072997705908e-06,
|
| 2214 |
+
"loss": 0.7748,
|
| 2215 |
+
"step": 3150
|
| 2216 |
+
},
|
| 2217 |
+
{
|
| 2218 |
+
"epoch": 0.9068732960252547,
|
| 2219 |
+
"grad_norm": 4.149963855743408,
|
| 2220 |
+
"learning_rate": 4.542657611726664e-06,
|
| 2221 |
+
"loss": 0.7651,
|
| 2222 |
+
"step": 3160
|
| 2223 |
+
},
|
| 2224 |
+
{
|
| 2225 |
+
"epoch": 0.9097431482278663,
|
| 2226 |
+
"grad_norm": 6.455443859100342,
|
| 2227 |
+
"learning_rate": 4.272658903476745e-06,
|
| 2228 |
+
"loss": 0.7769,
|
| 2229 |
+
"step": 3170
|
| 2230 |
+
},
|
| 2231 |
+
{
|
| 2232 |
+
"epoch": 0.9126130004304779,
|
| 2233 |
+
"grad_norm": 5.111416339874268,
|
| 2234 |
+
"learning_rate": 4.010756688446726e-06,
|
| 2235 |
+
"loss": 0.779,
|
| 2236 |
+
"step": 3180
|
| 2237 |
+
},
|
| 2238 |
+
{
|
| 2239 |
+
"epoch": 0.9154828526330894,
|
| 2240 |
+
"grad_norm": 5.0384440422058105,
|
| 2241 |
+
"learning_rate": 3.7569731179831537e-06,
|
| 2242 |
+
"loss": 0.7353,
|
| 2243 |
+
"step": 3190
|
| 2244 |
+
},
|
| 2245 |
+
{
|
| 2246 |
+
"epoch": 0.918352704835701,
|
| 2247 |
+
"grad_norm": 4.619420528411865,
|
| 2248 |
+
"learning_rate": 3.5113296567682476e-06,
|
| 2249 |
+
"loss": 0.7686,
|
| 2250 |
+
"step": 3200
|
| 2251 |
+
},
|
| 2252 |
+
{
|
| 2253 |
+
"epoch": 0.9212225570383126,
|
| 2254 |
+
"grad_norm": 5.13969612121582,
|
| 2255 |
+
"learning_rate": 3.2738470810044553e-06,
|
| 2256 |
+
"loss": 0.7475,
|
| 2257 |
+
"step": 3210
|
| 2258 |
+
},
|
| 2259 |
+
{
|
| 2260 |
+
"epoch": 0.9240924092409241,
|
| 2261 |
+
"grad_norm": 4.138948917388916,
|
| 2262 |
+
"learning_rate": 3.0445454766572235e-06,
|
| 2263 |
+
"loss": 0.743,
|
| 2264 |
+
"step": 3220
|
| 2265 |
+
},
|
| 2266 |
+
{
|
| 2267 |
+
"epoch": 0.9269622614435357,
|
| 2268 |
+
"grad_norm": 3.4994235038757324,
|
| 2269 |
+
"learning_rate": 2.8234442377561232e-06,
|
| 2270 |
+
"loss": 0.7491,
|
| 2271 |
+
"step": 3230
|
| 2272 |
+
},
|
| 2273 |
+
{
|
| 2274 |
+
"epoch": 0.9298321136461473,
|
| 2275 |
+
"grad_norm": 3.714160442352295,
|
| 2276 |
+
"learning_rate": 2.6105620647545734e-06,
|
| 2277 |
+
"loss": 0.7516,
|
| 2278 |
+
"step": 3240
|
| 2279 |
+
},
|
| 2280 |
+
{
|
| 2281 |
+
"epoch": 0.9327019658487588,
|
| 2282 |
+
"grad_norm": 3.1646008491516113,
|
| 2283 |
+
"learning_rate": 2.4059169629481403e-06,
|
| 2284 |
+
"loss": 0.751,
|
| 2285 |
+
"step": 3250
|
| 2286 |
+
},
|
| 2287 |
+
{
|
| 2288 |
+
"epoch": 0.9355718180513704,
|
| 2289 |
+
"grad_norm": 4.828333377838135,
|
| 2290 |
+
"learning_rate": 2.209526240951665e-06,
|
| 2291 |
+
"loss": 0.741,
|
| 2292 |
+
"step": 3260
|
| 2293 |
+
},
|
| 2294 |
+
{
|
| 2295 |
+
"epoch": 0.938441670253982,
|
| 2296 |
+
"grad_norm": 3.3315179347991943,
|
| 2297 |
+
"learning_rate": 2.021406509235402e-06,
|
| 2298 |
+
"loss": 0.7554,
|
| 2299 |
+
"step": 3270
|
| 2300 |
+
},
|
| 2301 |
+
{
|
| 2302 |
+
"epoch": 0.9413115224565934,
|
| 2303 |
+
"grad_norm": 6.141576766967773,
|
| 2304 |
+
"learning_rate": 1.8415736787200433e-06,
|
| 2305 |
+
"loss": 0.7465,
|
| 2306 |
+
"step": 3280
|
| 2307 |
+
},
|
| 2308 |
+
{
|
| 2309 |
+
"epoch": 0.944181374659205,
|
| 2310 |
+
"grad_norm": 4.839749336242676,
|
| 2311 |
+
"learning_rate": 1.6700429594310063e-06,
|
| 2312 |
+
"loss": 0.761,
|
| 2313 |
+
"step": 3290
|
| 2314 |
+
},
|
| 2315 |
+
{
|
| 2316 |
+
"epoch": 0.9470512268618166,
|
| 2317 |
+
"grad_norm": 4.683228969573975,
|
| 2318 |
+
"learning_rate": 1.5068288592120283e-06,
|
| 2319 |
+
"loss": 0.751,
|
| 2320 |
+
"step": 3300
|
| 2321 |
}
|
| 2322 |
],
|
| 2323 |
"logging_steps": 10,
|
|
|
|
| 2337 |
"attributes": {}
|
| 2338 |
}
|
| 2339 |
},
|
| 2340 |
+
"total_flos": 1.35032131289088e+20,
|
| 2341 |
"train_batch_size": 6,
|
| 2342 |
"trial_name": null,
|
| 2343 |
"trial_params": null
|