3N3G commited on
Commit
4849640
·
verified ·
1 Parent(s): 8b2651e

Training in progress, step 320, checkpoint

Browse files
last-checkpoint/model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ad5883a3cee7c47736b45f9790afc68bdd5c731da8985e21a6cb6b18802bee1
3
  size 4969539560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbb5a892c7bcd146f6eee0ac485a21efd55a8ffbc4d42c3ed0640fc44f041c63
3
  size 4969539560
last-checkpoint/model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:652f38f7fb6d6d424d146395fc676655eef11c276b4cc3a9e7a2d7d530069500
3
  size 1912795688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f51a049f43d70a0eb667a892f9ae752e6fde71c884bf47a93a4de8f13fcb645
3
  size 1912795688
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 72.0,
6
  "eval_steps": 16,
7
- "global_step": 288,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2168,6 +2168,246 @@
2168
  "eval_samples_per_second": 16.986,
2169
  "eval_steps_per_second": 16.986,
2170
  "step": 288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2171
  }
2172
  ],
2173
  "logging_steps": 1,
@@ -2187,7 +2427,7 @@
2187
  "attributes": {}
2188
  }
2189
  },
2190
- "total_flos": 9.659036592635904e+16,
2191
  "train_batch_size": 1,
2192
  "trial_name": null,
2193
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 80.0,
6
  "eval_steps": 16,
7
+ "global_step": 320,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2168
  "eval_samples_per_second": 16.986,
2169
  "eval_steps_per_second": 16.986,
2170
  "step": 288
2171
+ },
2172
+ {
2173
+ "epoch": 72.2909090909091,
2174
+ "grad_norm": 3.4279274940490723,
2175
+ "learning_rate": 2.983631934381639e-08,
2176
+ "loss": 0.6219,
2177
+ "step": 289
2178
+ },
2179
+ {
2180
+ "epoch": 72.58181818181818,
2181
+ "grad_norm": 3.627363681793213,
2182
+ "learning_rate": 2.9511719338382535e-08,
2183
+ "loss": 0.6635,
2184
+ "step": 290
2185
+ },
2186
+ {
2187
+ "epoch": 72.87272727272727,
2188
+ "grad_norm": 3.1634864807128906,
2189
+ "learning_rate": 2.918906036420294e-08,
2190
+ "loss": 0.6377,
2191
+ "step": 291
2192
+ },
2193
+ {
2194
+ "epoch": 73.0,
2195
+ "grad_norm": 3.896449327468872,
2196
+ "learning_rate": 2.886836699300771e-08,
2197
+ "loss": 0.7822,
2198
+ "step": 292
2199
+ },
2200
+ {
2201
+ "epoch": 73.2909090909091,
2202
+ "grad_norm": 3.168968677520752,
2203
+ "learning_rate": 2.8549663646838718e-08,
2204
+ "loss": 0.609,
2205
+ "step": 293
2206
+ },
2207
+ {
2208
+ "epoch": 73.58181818181818,
2209
+ "grad_norm": 3.3781349658966064,
2210
+ "learning_rate": 2.8232974596189653e-08,
2211
+ "loss": 0.6728,
2212
+ "step": 294
2213
+ },
2214
+ {
2215
+ "epoch": 73.87272727272727,
2216
+ "grad_norm": 3.41473650932312,
2217
+ "learning_rate": 2.791832395815782e-08,
2218
+ "loss": 0.7013,
2219
+ "step": 295
2220
+ },
2221
+ {
2222
+ "epoch": 74.0,
2223
+ "grad_norm": 3.771911859512329,
2224
+ "learning_rate": 2.760573569460757e-08,
2225
+ "loss": 0.6343,
2226
+ "step": 296
2227
+ },
2228
+ {
2229
+ "epoch": 74.2909090909091,
2230
+ "grad_norm": 3.527878761291504,
2231
+ "learning_rate": 2.729523361034538e-08,
2232
+ "loss": 0.6528,
2233
+ "step": 297
2234
+ },
2235
+ {
2236
+ "epoch": 74.58181818181818,
2237
+ "grad_norm": 3.105755090713501,
2238
+ "learning_rate": 2.6986841351307128e-08,
2239
+ "loss": 0.6243,
2240
+ "step": 298
2241
+ },
2242
+ {
2243
+ "epoch": 74.87272727272727,
2244
+ "grad_norm": 3.3217263221740723,
2245
+ "learning_rate": 2.6680582402757322e-08,
2246
+ "loss": 0.6658,
2247
+ "step": 299
2248
+ },
2249
+ {
2250
+ "epoch": 75.0,
2251
+ "grad_norm": 4.193359375,
2252
+ "learning_rate": 2.637648008750062e-08,
2253
+ "loss": 0.7016,
2254
+ "step": 300
2255
+ },
2256
+ {
2257
+ "epoch": 75.2909090909091,
2258
+ "grad_norm": 3.2874765396118164,
2259
+ "learning_rate": 2.6074557564105726e-08,
2260
+ "loss": 0.6661,
2261
+ "step": 301
2262
+ },
2263
+ {
2264
+ "epoch": 75.58181818181818,
2265
+ "grad_norm": 3.4806275367736816,
2266
+ "learning_rate": 2.5774837825141737e-08,
2267
+ "loss": 0.6277,
2268
+ "step": 302
2269
+ },
2270
+ {
2271
+ "epoch": 75.87272727272727,
2272
+ "grad_norm": 3.398120880126953,
2273
+ "learning_rate": 2.547734369542718e-08,
2274
+ "loss": 0.6863,
2275
+ "step": 303
2276
+ },
2277
+ {
2278
+ "epoch": 76.0,
2279
+ "grad_norm": 3.1762161254882812,
2280
+ "learning_rate": 2.5182097830291825e-08,
2281
+ "loss": 0.648,
2282
+ "step": 304
2283
+ },
2284
+ {
2285
+ "epoch": 76.0,
2286
+ "eval_loss": 0.6407743096351624,
2287
+ "eval_runtime": 0.7838,
2288
+ "eval_samples_per_second": 16.585,
2289
+ "eval_steps_per_second": 16.585,
2290
+ "step": 304
2291
+ },
2292
+ {
2293
+ "epoch": 76.2909090909091,
2294
+ "grad_norm": 3.250011444091797,
2295
+ "learning_rate": 2.4889122713851394e-08,
2296
+ "loss": 0.6552,
2297
+ "step": 305
2298
+ },
2299
+ {
2300
+ "epoch": 76.58181818181818,
2301
+ "grad_norm": 3.1045658588409424,
2302
+ "learning_rate": 2.4598440657295288e-08,
2303
+ "loss": 0.6147,
2304
+ "step": 306
2305
+ },
2306
+ {
2307
+ "epoch": 76.87272727272727,
2308
+ "grad_norm": 4.007096290588379,
2309
+ "learning_rate": 2.4310073797187574e-08,
2310
+ "loss": 0.7181,
2311
+ "step": 307
2312
+ },
2313
+ {
2314
+ "epoch": 77.0,
2315
+ "grad_norm": 3.300295829772949,
2316
+ "learning_rate": 2.4024044093781064e-08,
2317
+ "loss": 0.6115,
2318
+ "step": 308
2319
+ },
2320
+ {
2321
+ "epoch": 77.2909090909091,
2322
+ "grad_norm": 3.376610517501831,
2323
+ "learning_rate": 2.3740373329345117e-08,
2324
+ "loss": 0.7065,
2325
+ "step": 309
2326
+ },
2327
+ {
2328
+ "epoch": 77.58181818181818,
2329
+ "grad_norm": 3.1987497806549072,
2330
+ "learning_rate": 2.3459083106506712e-08,
2331
+ "loss": 0.6265,
2332
+ "step": 310
2333
+ },
2334
+ {
2335
+ "epoch": 77.87272727272727,
2336
+ "grad_norm": 3.428140878677368,
2337
+ "learning_rate": 2.3180194846605363e-08,
2338
+ "loss": 0.629,
2339
+ "step": 311
2340
+ },
2341
+ {
2342
+ "epoch": 78.0,
2343
+ "grad_norm": 3.489027261734009,
2344
+ "learning_rate": 2.2903729788061836e-08,
2345
+ "loss": 0.6626,
2346
+ "step": 312
2347
+ },
2348
+ {
2349
+ "epoch": 78.2909090909091,
2350
+ "grad_norm": 3.7477946281433105,
2351
+ "learning_rate": 2.2629708984760707e-08,
2352
+ "loss": 0.7006,
2353
+ "step": 313
2354
+ },
2355
+ {
2356
+ "epoch": 78.58181818181818,
2357
+ "grad_norm": 3.2413809299468994,
2358
+ "learning_rate": 2.2358153304447067e-08,
2359
+ "loss": 0.6363,
2360
+ "step": 314
2361
+ },
2362
+ {
2363
+ "epoch": 78.87272727272727,
2364
+ "grad_norm": 3.0365958213806152,
2365
+ "learning_rate": 2.2089083427137328e-08,
2366
+ "loss": 0.6307,
2367
+ "step": 315
2368
+ },
2369
+ {
2370
+ "epoch": 79.0,
2371
+ "grad_norm": 3.5392417907714844,
2372
+ "learning_rate": 2.182251984354442e-08,
2373
+ "loss": 0.6594,
2374
+ "step": 316
2375
+ },
2376
+ {
2377
+ "epoch": 79.2909090909091,
2378
+ "grad_norm": 3.2169861793518066,
2379
+ "learning_rate": 2.1558482853517254e-08,
2380
+ "loss": 0.6261,
2381
+ "step": 317
2382
+ },
2383
+ {
2384
+ "epoch": 79.58181818181818,
2385
+ "grad_norm": 3.1975908279418945,
2386
+ "learning_rate": 2.1296992564494903e-08,
2387
+ "loss": 0.6303,
2388
+ "step": 318
2389
+ },
2390
+ {
2391
+ "epoch": 79.87272727272727,
2392
+ "grad_norm": 3.5037009716033936,
2393
+ "learning_rate": 2.103806888997526e-08,
2394
+ "loss": 0.6847,
2395
+ "step": 319
2396
+ },
2397
+ {
2398
+ "epoch": 80.0,
2399
+ "grad_norm": 3.49397611618042,
2400
+ "learning_rate": 2.078173154799861e-08,
2401
+ "loss": 0.704,
2402
+ "step": 320
2403
+ },
2404
+ {
2405
+ "epoch": 80.0,
2406
+ "eval_loss": 0.6397803425788879,
2407
+ "eval_runtime": 0.7407,
2408
+ "eval_samples_per_second": 17.552,
2409
+ "eval_steps_per_second": 17.552,
2410
+ "step": 320
2411
  }
2412
  ],
2413
  "logging_steps": 1,
 
2427
  "attributes": {}
2428
  }
2429
  },
2430
+ "total_flos": 1.073226288070656e+17,
2431
  "train_batch_size": 1,
2432
  "trial_name": null,
2433
  "trial_params": null