8BitStudio commited on
Commit
39aae78
·
verified ·
1 Parent(s): 4a3d751

Training in progress, step 18000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e3c2ca1453671908d126e303eba98dd0d57768bc3b1dcb8cf48dcbd5df11353
3
  size 1520630616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cc7670971a4e72c8112661889eb09f4dd0aef80b62662f3de10aa7539b0126d
3
  size 1520630616
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3d6326aeb70f12a4b9828676ff7fb0f81b4f603e04b14d7e8b6337709d69892
3
  size 3041448587
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fa9da6182eb634e04ef5b1e88adde1464e5173eb3d0eb66d882cdcb5bad981e
3
  size 3041448587
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:839155d8d479a4428e25ab272c147641fcc513d85570b8d0b1dcd722136156e9
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59533f6351ee8031370e4d884b963a0d20ec1f96789095c1a70c9891c4bf2301
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:027f96c69ce599f1f33b2261db2960f4a6aaefef410e2d604c54d3aa094ca9a9
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fcb8a6684d40f0d667cf5f41391378b3c7ac8a01224006fa696bc503a2d8b3b
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 4.024349726775957,
6
  "eval_steps": 500,
7
- "global_step": 16000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2248,6 +2248,286 @@
2248
  "learning_rate": 0.0002830845786762962,
2249
  "loss": 1.7125,
2250
  "step": 16000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2251
  }
2252
  ],
2253
  "logging_steps": 50,
@@ -2267,7 +2547,7 @@
2267
  "attributes": {}
2268
  }
2269
  },
2270
- "total_flos": 8.556472791069622e+18,
2271
  "train_batch_size": 16,
2272
  "trial_name": null,
2273
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 5.008579234972678,
6
  "eval_steps": 500,
7
+ "global_step": 18000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2248
  "learning_rate": 0.0002830845786762962,
2249
  "loss": 1.7125,
2250
  "step": 16000
2251
+ },
2252
+ {
2253
+ "epoch": 4.024896174863388,
2254
+ "grad_norm": 0.6796875,
2255
+ "learning_rate": 0.0002829623770208463,
2256
+ "loss": 1.6678,
2257
+ "step": 16050
2258
+ },
2259
+ {
2260
+ "epoch": 4.02544262295082,
2261
+ "grad_norm": 0.55859375,
2262
+ "learning_rate": 0.00028283976211526137,
2263
+ "loss": 1.7396,
2264
+ "step": 16100
2265
+ },
2266
+ {
2267
+ "epoch": 4.025989071038252,
2268
+ "grad_norm": 0.578125,
2269
+ "learning_rate": 0.0002827167343406315,
2270
+ "loss": 1.752,
2271
+ "step": 16150
2272
+ },
2273
+ {
2274
+ "epoch": 4.026535519125683,
2275
+ "grad_norm": 0.56640625,
2276
+ "learning_rate": 0.0002825932940793298,
2277
+ "loss": 1.6994,
2278
+ "step": 16200
2279
+ },
2280
+ {
2281
+ "epoch": 4.027081967213115,
2282
+ "grad_norm": 0.6015625,
2283
+ "learning_rate": 0.00028246944171501145,
2284
+ "loss": 1.7189,
2285
+ "step": 16250
2286
+ },
2287
+ {
2288
+ "epoch": 4.027628415300547,
2289
+ "grad_norm": 0.5625,
2290
+ "learning_rate": 0.00028234517763261243,
2291
+ "loss": 1.7158,
2292
+ "step": 16300
2293
+ },
2294
+ {
2295
+ "epoch": 4.028174863387978,
2296
+ "grad_norm": 0.5625,
2297
+ "learning_rate": 0.00028222050221834847,
2298
+ "loss": 1.6885,
2299
+ "step": 16350
2300
+ },
2301
+ {
2302
+ "epoch": 4.02872131147541,
2303
+ "grad_norm": 0.5078125,
2304
+ "learning_rate": 0.0002820954158597134,
2305
+ "loss": 1.6865,
2306
+ "step": 16400
2307
+ },
2308
+ {
2309
+ "epoch": 4.029267759562842,
2310
+ "grad_norm": 0.52734375,
2311
+ "learning_rate": 0.0002819699189454788,
2312
+ "loss": 1.6946,
2313
+ "step": 16450
2314
+ },
2315
+ {
2316
+ "epoch": 4.029814207650273,
2317
+ "grad_norm": 0.5703125,
2318
+ "learning_rate": 0.0002818440118656918,
2319
+ "loss": 1.7194,
2320
+ "step": 16500
2321
+ },
2322
+ {
2323
+ "epoch": 4.030360655737705,
2324
+ "grad_norm": 0.52734375,
2325
+ "learning_rate": 0.0002817176950116746,
2326
+ "loss": 1.7022,
2327
+ "step": 16550
2328
+ },
2329
+ {
2330
+ "epoch": 4.030907103825137,
2331
+ "grad_norm": 0.61328125,
2332
+ "learning_rate": 0.00028159096877602275,
2333
+ "loss": 1.7159,
2334
+ "step": 16600
2335
+ },
2336
+ {
2337
+ "epoch": 4.031453551912568,
2338
+ "grad_norm": 0.5625,
2339
+ "learning_rate": 0.00028146383355260446,
2340
+ "loss": 1.7063,
2341
+ "step": 16650
2342
+ },
2343
+ {
2344
+ "epoch": 4.032,
2345
+ "grad_norm": 0.58984375,
2346
+ "learning_rate": 0.00028133628973655894,
2347
+ "loss": 1.7064,
2348
+ "step": 16700
2349
+ },
2350
+ {
2351
+ "epoch": 4.032546448087432,
2352
+ "grad_norm": 0.55859375,
2353
+ "learning_rate": 0.00028120833772429517,
2354
+ "loss": 1.6992,
2355
+ "step": 16750
2356
+ },
2357
+ {
2358
+ "epoch": 4.033092896174863,
2359
+ "grad_norm": 0.5546875,
2360
+ "learning_rate": 0.0002810799779134911,
2361
+ "loss": 1.7132,
2362
+ "step": 16800
2363
+ },
2364
+ {
2365
+ "epoch": 4.033639344262295,
2366
+ "grad_norm": 0.5390625,
2367
+ "learning_rate": 0.0002809512107030919,
2368
+ "loss": 1.7125,
2369
+ "step": 16850
2370
+ },
2371
+ {
2372
+ "epoch": 4.034185792349727,
2373
+ "grad_norm": 0.57421875,
2374
+ "learning_rate": 0.0002808220364933091,
2375
+ "loss": 1.6373,
2376
+ "step": 16900
2377
+ },
2378
+ {
2379
+ "epoch": 4.034732240437158,
2380
+ "grad_norm": 0.5234375,
2381
+ "learning_rate": 0.00028069245568561904,
2382
+ "loss": 1.7379,
2383
+ "step": 16950
2384
+ },
2385
+ {
2386
+ "epoch": 4.03527868852459,
2387
+ "grad_norm": 0.6015625,
2388
+ "learning_rate": 0.00028056246868276186,
2389
+ "loss": 1.699,
2390
+ "step": 17000
2391
+ },
2392
+ {
2393
+ "epoch": 4.035825136612022,
2394
+ "grad_norm": 0.5546875,
2395
+ "learning_rate": 0.0002804320758887403,
2396
+ "loss": 1.6939,
2397
+ "step": 17050
2398
+ },
2399
+ {
2400
+ "epoch": 4.036371584699453,
2401
+ "grad_norm": 0.498046875,
2402
+ "learning_rate": 0.000280301277708818,
2403
+ "loss": 1.7146,
2404
+ "step": 17100
2405
+ },
2406
+ {
2407
+ "epoch": 4.036918032786885,
2408
+ "grad_norm": 0.5859375,
2409
+ "learning_rate": 0.00028017007454951884,
2410
+ "loss": 1.7363,
2411
+ "step": 17150
2412
+ },
2413
+ {
2414
+ "epoch": 4.037464480874317,
2415
+ "grad_norm": 0.55078125,
2416
+ "learning_rate": 0.00028003846681862524,
2417
+ "loss": 1.6965,
2418
+ "step": 17200
2419
+ },
2420
+ {
2421
+ "epoch": 5.000382513661203,
2422
+ "grad_norm": 0.5703125,
2423
+ "learning_rate": 0.00027990645492517697,
2424
+ "loss": 1.6887,
2425
+ "step": 17250
2426
+ },
2427
+ {
2428
+ "epoch": 5.000928961748634,
2429
+ "grad_norm": 0.62109375,
2430
+ "learning_rate": 0.0002797740392794702,
2431
+ "loss": 1.6829,
2432
+ "step": 17300
2433
+ },
2434
+ {
2435
+ "epoch": 5.001475409836066,
2436
+ "grad_norm": 0.58984375,
2437
+ "learning_rate": 0.0002796412202930557,
2438
+ "loss": 1.6235,
2439
+ "step": 17350
2440
+ },
2441
+ {
2442
+ "epoch": 5.002021857923498,
2443
+ "grad_norm": 0.59375,
2444
+ "learning_rate": 0.00027950799837873794,
2445
+ "loss": 1.634,
2446
+ "step": 17400
2447
+ },
2448
+ {
2449
+ "epoch": 5.002568306010929,
2450
+ "grad_norm": 0.55859375,
2451
+ "learning_rate": 0.0002793743739505738,
2452
+ "loss": 1.5863,
2453
+ "step": 17450
2454
+ },
2455
+ {
2456
+ "epoch": 5.003114754098361,
2457
+ "grad_norm": 0.5546875,
2458
+ "learning_rate": 0.0002792403474238709,
2459
+ "loss": 1.6513,
2460
+ "step": 17500
2461
+ },
2462
+ {
2463
+ "epoch": 5.003661202185793,
2464
+ "grad_norm": 0.59375,
2465
+ "learning_rate": 0.000279105919215187,
2466
+ "loss": 1.6388,
2467
+ "step": 17550
2468
+ },
2469
+ {
2470
+ "epoch": 5.004207650273224,
2471
+ "grad_norm": 0.59375,
2472
+ "learning_rate": 0.00027897108974232797,
2473
+ "loss": 1.6302,
2474
+ "step": 17600
2475
+ },
2476
+ {
2477
+ "epoch": 5.004754098360656,
2478
+ "grad_norm": 1.1875,
2479
+ "learning_rate": 0.0002788358594243469,
2480
+ "loss": 1.6336,
2481
+ "step": 17650
2482
+ },
2483
+ {
2484
+ "epoch": 5.005300546448088,
2485
+ "grad_norm": 0.58984375,
2486
+ "learning_rate": 0.0002787002286815428,
2487
+ "loss": 1.6513,
2488
+ "step": 17700
2489
+ },
2490
+ {
2491
+ "epoch": 5.005846994535519,
2492
+ "grad_norm": 0.6015625,
2493
+ "learning_rate": 0.000278564197935459,
2494
+ "loss": 1.6185,
2495
+ "step": 17750
2496
+ },
2497
+ {
2498
+ "epoch": 5.006393442622951,
2499
+ "grad_norm": 0.58984375,
2500
+ "learning_rate": 0.00027842776760888236,
2501
+ "loss": 1.6366,
2502
+ "step": 17800
2503
+ },
2504
+ {
2505
+ "epoch": 5.006939890710383,
2506
+ "grad_norm": 0.578125,
2507
+ "learning_rate": 0.00027829093812584143,
2508
+ "loss": 1.6162,
2509
+ "step": 17850
2510
+ },
2511
+ {
2512
+ "epoch": 5.007486338797814,
2513
+ "grad_norm": 0.56640625,
2514
+ "learning_rate": 0.0002781537099116054,
2515
+ "loss": 1.6665,
2516
+ "step": 17900
2517
+ },
2518
+ {
2519
+ "epoch": 5.008032786885246,
2520
+ "grad_norm": 0.55859375,
2521
+ "learning_rate": 0.00027801608339268275,
2522
+ "loss": 1.6505,
2523
+ "step": 17950
2524
+ },
2525
+ {
2526
+ "epoch": 5.008579234972678,
2527
+ "grad_norm": 0.578125,
2528
+ "learning_rate": 0.00027787805899681976,
2529
+ "loss": 1.6079,
2530
+ "step": 18000
2531
  }
2532
  ],
2533
  "logging_steps": 50,
 
2547
  "attributes": {}
2548
  }
2549
  },
2550
+ "total_flos": 9.6261070880401e+18,
2551
  "train_batch_size": 16,
2552
  "trial_name": null,
2553
  "trial_params": null