Azrail commited on
Commit
3b114ee
·
verified ·
1 Parent(s): 99dd611

Training in progress, step 15000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67b3b35db2c19f35fe025798e859f89450cb9547846af5202deac481cd7c5f41
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdf1b0f1f31678e02e392db01936d097de602c17608d494f22362854ea1faea3
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6db4391958de2af60b776a710ac499c24b4612827a08c4c3d9596c220966f1b3
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e719c4342dca47d074b491692ded6689afd8a826c27c44b36fe769c38219ad92
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:703a2c772f49bb55a4740bd10b6f1adb07416bc938539fda0388f46713083aaa
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed76fec8b31c184dac30ebd8181dfe95aa10c557692428e198df8bc24024a3d1
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3982f421b6a562fab23a5b9409962e3a2e613661137ac332f25f7e679b9669f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2f83db9058ff0e3a2778afbea4452d3483a420d1f349a8a276a60ee0edb90fc
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3075246054006263,
6
  "eval_steps": 500,
7
- "global_step": 14000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2500,11 +2500,189 @@
2500
  "eval_steps_per_second": 19.149,
2501
  "num_input_tokens_seen": 14680064000,
2502
  "step": 14000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2503
  }
2504
  ],
2505
  "logging_steps": 50,
2506
  "max_steps": 200000,
2507
- "num_input_tokens_seen": 14680064000,
2508
  "num_train_epochs": 5,
2509
  "save_steps": 1000,
2510
  "stateful_callbacks": {
@@ -2519,7 +2697,7 @@
2519
  "attributes": {}
2520
  }
2521
  },
2522
- "total_flos": 8.360404023508992e+18,
2523
  "train_batch_size": 64,
2524
  "trial_name": null,
2525
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3294906486435282,
6
  "eval_steps": 500,
7
+ "global_step": 15000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2500
  "eval_steps_per_second": 19.149,
2501
  "num_input_tokens_seen": 14680064000,
2502
  "step": 14000
2503
+ },
2504
+ {
2505
+ "epoch": 0.3086229075627714,
2506
+ "grad_norm": 0.13326038420200348,
2507
+ "learning_rate": 0.001,
2508
+ "loss": 2.7622,
2509
+ "num_input_tokens_seen": 14732492800,
2510
+ "step": 14050
2511
+ },
2512
+ {
2513
+ "epoch": 0.3097212097249165,
2514
+ "grad_norm": 0.14305976033210754,
2515
+ "learning_rate": 0.001,
2516
+ "loss": 2.7597,
2517
+ "num_input_tokens_seen": 14784921600,
2518
+ "step": 14100
2519
+ },
2520
+ {
2521
+ "epoch": 0.3108195118870616,
2522
+ "grad_norm": 0.1182415783405304,
2523
+ "learning_rate": 0.001,
2524
+ "loss": 2.758,
2525
+ "num_input_tokens_seen": 14837350400,
2526
+ "step": 14150
2527
+ },
2528
+ {
2529
+ "epoch": 0.3119178140492067,
2530
+ "grad_norm": 0.12919387221336365,
2531
+ "learning_rate": 0.001,
2532
+ "loss": 2.759,
2533
+ "num_input_tokens_seen": 14889779200,
2534
+ "step": 14200
2535
+ },
2536
+ {
2537
+ "epoch": 0.3130161162113518,
2538
+ "grad_norm": 0.1420537382364273,
2539
+ "learning_rate": 0.001,
2540
+ "loss": 2.7519,
2541
+ "num_input_tokens_seen": 14942208000,
2542
+ "step": 14250
2543
+ },
2544
+ {
2545
+ "epoch": 0.31411441837349685,
2546
+ "grad_norm": 0.14349806308746338,
2547
+ "learning_rate": 0.001,
2548
+ "loss": 2.7653,
2549
+ "num_input_tokens_seen": 14994636800,
2550
+ "step": 14300
2551
+ },
2552
+ {
2553
+ "epoch": 0.315212720535642,
2554
+ "grad_norm": 0.16453324258327484,
2555
+ "learning_rate": 0.001,
2556
+ "loss": 2.7642,
2557
+ "num_input_tokens_seen": 15047065600,
2558
+ "step": 14350
2559
+ },
2560
+ {
2561
+ "epoch": 0.31631102269778705,
2562
+ "grad_norm": 0.11806487292051315,
2563
+ "learning_rate": 0.001,
2564
+ "loss": 2.7605,
2565
+ "num_input_tokens_seen": 15099494400,
2566
+ "step": 14400
2567
+ },
2568
+ {
2569
+ "epoch": 0.3174093248599322,
2570
+ "grad_norm": 0.12850746512413025,
2571
+ "learning_rate": 0.001,
2572
+ "loss": 2.7539,
2573
+ "num_input_tokens_seen": 15151923200,
2574
+ "step": 14450
2575
+ },
2576
+ {
2577
+ "epoch": 0.31850762702207724,
2578
+ "grad_norm": 0.1480904221534729,
2579
+ "learning_rate": 0.001,
2580
+ "loss": 2.7574,
2581
+ "num_input_tokens_seen": 15204352000,
2582
+ "step": 14500
2583
+ },
2584
+ {
2585
+ "epoch": 0.31850762702207724,
2586
+ "eval_loss": 2.6607398986816406,
2587
+ "eval_runtime": 65.6281,
2588
+ "eval_samples_per_second": 76.187,
2589
+ "eval_steps_per_second": 19.047,
2590
+ "num_input_tokens_seen": 15204352000,
2591
+ "step": 14500
2592
+ },
2593
+ {
2594
+ "epoch": 0.3196059291842223,
2595
+ "grad_norm": 0.13606210052967072,
2596
+ "learning_rate": 0.001,
2597
+ "loss": 2.763,
2598
+ "num_input_tokens_seen": 15256780800,
2599
+ "step": 14550
2600
+ },
2601
+ {
2602
+ "epoch": 0.32070423134636744,
2603
+ "grad_norm": 0.12546846270561218,
2604
+ "learning_rate": 0.001,
2605
+ "loss": 2.7556,
2606
+ "num_input_tokens_seen": 15309209600,
2607
+ "step": 14600
2608
+ },
2609
+ {
2610
+ "epoch": 0.3218025335085125,
2611
+ "grad_norm": 0.1267230361700058,
2612
+ "learning_rate": 0.001,
2613
+ "loss": 2.7617,
2614
+ "num_input_tokens_seen": 15361638400,
2615
+ "step": 14650
2616
+ },
2617
+ {
2618
+ "epoch": 0.32290083567065764,
2619
+ "grad_norm": 0.13812699913978577,
2620
+ "learning_rate": 0.001,
2621
+ "loss": 2.7533,
2622
+ "num_input_tokens_seen": 15414067200,
2623
+ "step": 14700
2624
+ },
2625
+ {
2626
+ "epoch": 0.3239991378328027,
2627
+ "grad_norm": 0.12577973306179047,
2628
+ "learning_rate": 0.001,
2629
+ "loss": 2.7519,
2630
+ "num_input_tokens_seen": 15466496000,
2631
+ "step": 14750
2632
+ },
2633
+ {
2634
+ "epoch": 0.32509743999494783,
2635
+ "grad_norm": 0.14296036958694458,
2636
+ "learning_rate": 0.001,
2637
+ "loss": 2.7479,
2638
+ "num_input_tokens_seen": 15518924800,
2639
+ "step": 14800
2640
+ },
2641
+ {
2642
+ "epoch": 0.3261957421570929,
2643
+ "grad_norm": 0.12737593054771423,
2644
+ "learning_rate": 0.001,
2645
+ "loss": 2.7546,
2646
+ "num_input_tokens_seen": 15571353600,
2647
+ "step": 14850
2648
+ },
2649
+ {
2650
+ "epoch": 0.327294044319238,
2651
+ "grad_norm": 0.1349722445011139,
2652
+ "learning_rate": 0.001,
2653
+ "loss": 2.7477,
2654
+ "num_input_tokens_seen": 15623782400,
2655
+ "step": 14900
2656
+ },
2657
+ {
2658
+ "epoch": 0.3283923464813831,
2659
+ "grad_norm": 0.12827487289905548,
2660
+ "learning_rate": 0.001,
2661
+ "loss": 2.7492,
2662
+ "num_input_tokens_seen": 15676211200,
2663
+ "step": 14950
2664
+ },
2665
+ {
2666
+ "epoch": 0.3294906486435282,
2667
+ "grad_norm": 0.13282813131809235,
2668
+ "learning_rate": 0.001,
2669
+ "loss": 2.7466,
2670
+ "num_input_tokens_seen": 15728640000,
2671
+ "step": 15000
2672
+ },
2673
+ {
2674
+ "epoch": 0.3294906486435282,
2675
+ "eval_loss": 2.6524744033813477,
2676
+ "eval_runtime": 65.8996,
2677
+ "eval_samples_per_second": 75.873,
2678
+ "eval_steps_per_second": 18.968,
2679
+ "num_input_tokens_seen": 15728640000,
2680
+ "step": 15000
2681
  }
2682
  ],
2683
  "logging_steps": 50,
2684
  "max_steps": 200000,
2685
+ "num_input_tokens_seen": 15728640000,
2686
  "num_train_epochs": 5,
2687
  "save_steps": 1000,
2688
  "stateful_callbacks": {
 
2697
  "attributes": {}
2698
  }
2699
  },
2700
+ "total_flos": 8.95757573947392e+18,
2701
  "train_batch_size": 64,
2702
  "trial_name": null,
2703
  "trial_params": null