Azrail commited on
Commit
68a014e
·
verified ·
1 Parent(s): cad0c24

Training in progress, step 14000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b7373ac11401636769557d7c41bd131eaa1ff29f1ac0bd8ece04d73a85d45b3
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67b3b35db2c19f35fe025798e859f89450cb9547846af5202deac481cd7c5f41
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:600ff1a38a47f869ae5492791562a9ea82c55e0368079b5f56587277995a7652
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6db4391958de2af60b776a710ac499c24b4612827a08c4c3d9596c220966f1b3
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa45bf7feccf57a31c0d1db361074f3cc8988037f2a20ad89dd89a197a5582fe
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:703a2c772f49bb55a4740bd10b6f1adb07416bc938539fda0388f46713083aaa
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:314b286b574cdec8b8035ea2a5d06f7aaf8f954a409646e55b7a4304b27476aa
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3982f421b6a562fab23a5b9409962e3a2e613661137ac332f25f7e679b9669f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2855585621577244,
6
  "eval_steps": 500,
7
- "global_step": 13000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2322,11 +2322,189 @@
2322
  "eval_steps_per_second": 18.93,
2323
  "num_input_tokens_seen": 13631488000,
2324
  "step": 13000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2325
  }
2326
  ],
2327
  "logging_steps": 50,
2328
  "max_steps": 200000,
2329
- "num_input_tokens_seen": 13631488000,
2330
  "num_train_epochs": 5,
2331
  "save_steps": 1000,
2332
  "stateful_callbacks": {
@@ -2341,7 +2519,7 @@
2341
  "attributes": {}
2342
  }
2343
  },
2344
- "total_flos": 7.763232307544064e+18,
2345
  "train_batch_size": 64,
2346
  "trial_name": null,
2347
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3075246054006263,
6
  "eval_steps": 500,
7
+ "global_step": 14000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2322
  "eval_steps_per_second": 18.93,
2323
  "num_input_tokens_seen": 13631488000,
2324
  "step": 13000
2325
+ },
2326
+ {
2327
+ "epoch": 0.28665686431986953,
2328
+ "grad_norm": 0.13635839521884918,
2329
+ "learning_rate": 0.001,
2330
+ "loss": 2.7705,
2331
+ "num_input_tokens_seen": 13683916800,
2332
+ "step": 13050
2333
+ },
2334
+ {
2335
+ "epoch": 0.2877551664820146,
2336
+ "grad_norm": 0.1449163854122162,
2337
+ "learning_rate": 0.001,
2338
+ "loss": 2.775,
2339
+ "num_input_tokens_seen": 13736345600,
2340
+ "step": 13100
2341
+ },
2342
+ {
2343
+ "epoch": 0.2888534686441597,
2344
+ "grad_norm": 0.1385536640882492,
2345
+ "learning_rate": 0.001,
2346
+ "loss": 2.7705,
2347
+ "num_input_tokens_seen": 13788774400,
2348
+ "step": 13150
2349
+ },
2350
+ {
2351
+ "epoch": 0.2899517708063048,
2352
+ "grad_norm": 0.14647842943668365,
2353
+ "learning_rate": 0.001,
2354
+ "loss": 2.7709,
2355
+ "num_input_tokens_seen": 13841203200,
2356
+ "step": 13200
2357
+ },
2358
+ {
2359
+ "epoch": 0.2910500729684499,
2360
+ "grad_norm": 0.14193060994148254,
2361
+ "learning_rate": 0.001,
2362
+ "loss": 2.7753,
2363
+ "num_input_tokens_seen": 13893632000,
2364
+ "step": 13250
2365
+ },
2366
+ {
2367
+ "epoch": 0.292148375130595,
2368
+ "grad_norm": 0.15065765380859375,
2369
+ "learning_rate": 0.001,
2370
+ "loss": 2.7725,
2371
+ "num_input_tokens_seen": 13946060800,
2372
+ "step": 13300
2373
+ },
2374
+ {
2375
+ "epoch": 0.29324667729274007,
2376
+ "grad_norm": 0.1726570725440979,
2377
+ "learning_rate": 0.001,
2378
+ "loss": 2.7677,
2379
+ "num_input_tokens_seen": 13998489600,
2380
+ "step": 13350
2381
+ },
2382
+ {
2383
+ "epoch": 0.2943449794548852,
2384
+ "grad_norm": 0.13577735424041748,
2385
+ "learning_rate": 0.001,
2386
+ "loss": 2.7661,
2387
+ "num_input_tokens_seen": 14050918400,
2388
+ "step": 13400
2389
+ },
2390
+ {
2391
+ "epoch": 0.29544328161703026,
2392
+ "grad_norm": 0.1286347657442093,
2393
+ "learning_rate": 0.001,
2394
+ "loss": 2.7642,
2395
+ "num_input_tokens_seen": 14103347200,
2396
+ "step": 13450
2397
+ },
2398
+ {
2399
+ "epoch": 0.2965415837791754,
2400
+ "grad_norm": 0.12374001741409302,
2401
+ "learning_rate": 0.001,
2402
+ "loss": 2.7651,
2403
+ "num_input_tokens_seen": 14155776000,
2404
+ "step": 13500
2405
+ },
2406
+ {
2407
+ "epoch": 0.2965415837791754,
2408
+ "eval_loss": 2.6711983680725098,
2409
+ "eval_runtime": 65.6737,
2410
+ "eval_samples_per_second": 76.134,
2411
+ "eval_steps_per_second": 19.033,
2412
+ "num_input_tokens_seen": 14155776000,
2413
+ "step": 13500
2414
+ },
2415
+ {
2416
+ "epoch": 0.29763988594132046,
2417
+ "grad_norm": 0.1733749508857727,
2418
+ "learning_rate": 0.001,
2419
+ "loss": 2.765,
2420
+ "num_input_tokens_seen": 14208204800,
2421
+ "step": 13550
2422
+ },
2423
+ {
2424
+ "epoch": 0.29873818810346553,
2425
+ "grad_norm": 0.1459003984928131,
2426
+ "learning_rate": 0.001,
2427
+ "loss": 2.7683,
2428
+ "num_input_tokens_seen": 14260633600,
2429
+ "step": 13600
2430
+ },
2431
+ {
2432
+ "epoch": 0.29983649026561066,
2433
+ "grad_norm": 0.1527784913778305,
2434
+ "learning_rate": 0.001,
2435
+ "loss": 2.7678,
2436
+ "num_input_tokens_seen": 14313062400,
2437
+ "step": 13650
2438
+ },
2439
+ {
2440
+ "epoch": 0.3009347924277557,
2441
+ "grad_norm": 0.1344996690750122,
2442
+ "learning_rate": 0.001,
2443
+ "loss": 2.7613,
2444
+ "num_input_tokens_seen": 14365491200,
2445
+ "step": 13700
2446
+ },
2447
+ {
2448
+ "epoch": 0.30203309458990085,
2449
+ "grad_norm": 0.1291748583316803,
2450
+ "learning_rate": 0.001,
2451
+ "loss": 2.7682,
2452
+ "num_input_tokens_seen": 14417920000,
2453
+ "step": 13750
2454
+ },
2455
+ {
2456
+ "epoch": 0.3031313967520459,
2457
+ "grad_norm": 0.1352360099554062,
2458
+ "learning_rate": 0.001,
2459
+ "loss": 2.764,
2460
+ "num_input_tokens_seen": 14470348800,
2461
+ "step": 13800
2462
+ },
2463
+ {
2464
+ "epoch": 0.30422969891419105,
2465
+ "grad_norm": 0.13686618208885193,
2466
+ "learning_rate": 0.001,
2467
+ "loss": 2.7638,
2468
+ "num_input_tokens_seen": 14522777600,
2469
+ "step": 13850
2470
+ },
2471
+ {
2472
+ "epoch": 0.3053280010763361,
2473
+ "grad_norm": 0.15377116203308105,
2474
+ "learning_rate": 0.001,
2475
+ "loss": 2.7639,
2476
+ "num_input_tokens_seen": 14575206400,
2477
+ "step": 13900
2478
+ },
2479
+ {
2480
+ "epoch": 0.3064263032384812,
2481
+ "grad_norm": 0.13904446363449097,
2482
+ "learning_rate": 0.001,
2483
+ "loss": 2.7666,
2484
+ "num_input_tokens_seen": 14627635200,
2485
+ "step": 13950
2486
+ },
2487
+ {
2488
+ "epoch": 0.3075246054006263,
2489
+ "grad_norm": 0.12402611970901489,
2490
+ "learning_rate": 0.001,
2491
+ "loss": 2.759,
2492
+ "num_input_tokens_seen": 14680064000,
2493
+ "step": 14000
2494
+ },
2495
+ {
2496
+ "epoch": 0.3075246054006263,
2497
+ "eval_loss": 2.6654388904571533,
2498
+ "eval_runtime": 65.2775,
2499
+ "eval_samples_per_second": 76.596,
2500
+ "eval_steps_per_second": 19.149,
2501
+ "num_input_tokens_seen": 14680064000,
2502
+ "step": 14000
2503
  }
2504
  ],
2505
  "logging_steps": 50,
2506
  "max_steps": 200000,
2507
+ "num_input_tokens_seen": 14680064000,
2508
  "num_train_epochs": 5,
2509
  "save_steps": 1000,
2510
  "stateful_callbacks": {
 
2519
  "attributes": {}
2520
  }
2521
  },
2522
+ "total_flos": 8.360404023508992e+18,
2523
  "train_batch_size": 64,
2524
  "trial_name": null,
2525
  "trial_params": null