Azrail commited on
Commit
9c9e234
·
verified ·
1 Parent(s): 9731cc8

Training in progress, step 13000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20ce803e4fe3b56cd4360f4db9480f38ae31a8a1afff95c81fdf52f10ba0ac27
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23be2c11c244c72601ea6f47dd507781736231ff1da2289fe5f8ba433277cb99
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57fe321ef28d6e68df66c93386db17dd5a5a544cc930d8f0b18e8919d55948ac
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04943bdcad0923c88796f61e80a911b94cde9c121a1bb27006e82c8a584a0c44
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9ed029202dee104daa8ff402cceceb0954b69804876f1536e7023f304ea17a5
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff84b2998c9ce4e6e3eaf03e775fc93a7c11be8195c0bb3abb7a8b9a1cec86e5
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26e93f6b8bba63ca3cafe7523f1374f6bb0c88ef8963cfe1774bbd335ba7bcbe
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71a524f67e79e2b512d6d818f94e2b528e5b7f4447259f3966ae44cdba439db5
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.8966006306863616,
6
  "eval_steps": 500,
7
- "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2624,11 +2624,229 @@
2624
  "eval_steps_per_second": 20.507,
2625
  "num_input_tokens_seen": 5797304337,
2626
  "step": 12000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2627
  }
2628
  ],
2629
  "logging_steps": 50,
2630
  "max_steps": 16568,
2631
- "num_input_tokens_seen": 5797304337,
2632
  "num_train_epochs": 4,
2633
  "save_steps": 1000,
2634
  "stateful_callbacks": {
@@ -2643,7 +2861,7 @@
2643
  "attributes": {}
2644
  }
2645
  },
2646
- "total_flos": 1.550835491837829e+18,
2647
  "train_batch_size": 16,
2648
  "trial_name": null,
2649
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.1378457081642197,
6
  "eval_steps": 500,
7
+ "global_step": 13000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2624
  "eval_steps_per_second": 20.507,
2625
  "num_input_tokens_seen": 5797304337,
2626
  "step": 12000
2627
+ },
2628
+ {
2629
+ "epoch": 2.908671183065015,
2630
+ "grad_norm": 0.2578125,
2631
+ "learning_rate": 1.704391127206881e-05,
2632
+ "loss": 2.0958,
2633
+ "mean_token_accuracy": 0.5546299646422267,
2634
+ "num_input_tokens_seen": 5821442385,
2635
+ "num_tokens": 2453453845.0,
2636
+ "step": 12050
2637
+ },
2638
+ {
2639
+ "epoch": 2.920741735443668,
2640
+ "grad_norm": 0.26171875,
2641
+ "learning_rate": 1.685528896936774e-05,
2642
+ "loss": 2.0926,
2643
+ "mean_token_accuracy": 0.5549974143505096,
2644
+ "num_input_tokens_seen": 5845686961,
2645
+ "num_tokens": 2463776050.0,
2646
+ "step": 12100
2647
+ },
2648
+ {
2649
+ "epoch": 2.9328122878223217,
2650
+ "grad_norm": 0.263671875,
2651
+ "learning_rate": 1.6666666666666667e-05,
2652
+ "loss": 2.1015,
2653
+ "mean_token_accuracy": 0.5541527543962002,
2654
+ "num_input_tokens_seen": 5869745137,
2655
+ "num_tokens": 2473828195.0,
2656
+ "step": 12150
2657
+ },
2658
+ {
2659
+ "epoch": 2.944882840200975,
2660
+ "grad_norm": 0.26953125,
2661
+ "learning_rate": 1.6478044363965596e-05,
2662
+ "loss": 2.1041,
2663
+ "mean_token_accuracy": 0.5541104365140199,
2664
+ "num_input_tokens_seen": 5893803025,
2665
+ "num_tokens": 2483915340.0,
2666
+ "step": 12200
2667
+ },
2668
+ {
2669
+ "epoch": 2.956953392579628,
2670
+ "grad_norm": 0.2333984375,
2671
+ "learning_rate": 1.6289422061264525e-05,
2672
+ "loss": 2.0922,
2673
+ "mean_token_accuracy": 0.5555301706120371,
2674
+ "num_input_tokens_seen": 5918068641,
2675
+ "num_tokens": 2494208052.0,
2676
+ "step": 12250
2677
+ },
2678
+ {
2679
+ "epoch": 2.9690239449582814,
2680
+ "grad_norm": 0.2490234375,
2681
+ "learning_rate": 1.6100799758563453e-05,
2682
+ "loss": 2.0938,
2683
+ "mean_token_accuracy": 0.5548020200431347,
2684
+ "num_input_tokens_seen": 5942257041,
2685
+ "num_tokens": 2504393372.0,
2686
+ "step": 12300
2687
+ },
2688
+ {
2689
+ "epoch": 2.9810944973369344,
2690
+ "grad_norm": 0.2890625,
2691
+ "learning_rate": 1.5912177455862382e-05,
2692
+ "loss": 2.0843,
2693
+ "mean_token_accuracy": 0.5566597804427147,
2694
+ "num_input_tokens_seen": 5966422081,
2695
+ "num_tokens": 2514627798.0,
2696
+ "step": 12350
2697
+ },
2698
+ {
2699
+ "epoch": 2.9931650497155875,
2700
+ "grad_norm": 0.2734375,
2701
+ "learning_rate": 1.572355515316131e-05,
2702
+ "loss": 2.0886,
2703
+ "mean_token_accuracy": 0.5566082544624805,
2704
+ "num_input_tokens_seen": 5990574321,
2705
+ "num_tokens": 2524805007.0,
2706
+ "step": 12400
2707
+ },
2708
+ {
2709
+ "epoch": 3.005069631999034,
2710
+ "grad_norm": 0.26171875,
2711
+ "learning_rate": 1.553493285046024e-05,
2712
+ "loss": 2.1001,
2713
+ "mean_token_accuracy": 0.5549402527407246,
2714
+ "num_input_tokens_seen": 6014380145,
2715
+ "num_tokens": 2534738802.0,
2716
+ "step": 12450
2717
+ },
2718
+ {
2719
+ "epoch": 3.0171401843776877,
2720
+ "grad_norm": 0.2314453125,
2721
+ "learning_rate": 1.5346310547759168e-05,
2722
+ "loss": 2.092,
2723
+ "num_input_tokens_seen": 6038556753,
2724
+ "step": 12500
2725
+ },
2726
+ {
2727
+ "epoch": 3.0171401843776877,
2728
+ "eval_loss": 1.9681233167648315,
2729
+ "eval_mean_token_accuracy": 0.5784891846355349,
2730
+ "eval_num_tokens": 2544886437.0,
2731
+ "eval_runtime": 130.6689,
2732
+ "eval_samples_per_second": 81.978,
2733
+ "eval_steps_per_second": 20.495,
2734
+ "num_input_tokens_seen": 6038556753,
2735
+ "step": 12500
2736
+ },
2737
+ {
2738
+ "epoch": 3.029210736756341,
2739
+ "grad_norm": 0.25390625,
2740
+ "learning_rate": 1.5157688245058096e-05,
2741
+ "loss": 2.0925,
2742
+ "mean_token_accuracy": 0.5550393326207995,
2743
+ "num_input_tokens_seen": 6062857617,
2744
+ "num_tokens": 2555112151.0,
2745
+ "step": 12550
2746
+ },
2747
+ {
2748
+ "epoch": 3.041281289134994,
2749
+ "grad_norm": 0.38671875,
2750
+ "learning_rate": 1.4969065942357025e-05,
2751
+ "loss": 2.0957,
2752
+ "mean_token_accuracy": 0.5551863227039575,
2753
+ "num_input_tokens_seen": 6087077841,
2754
+ "num_tokens": 2565388515.0,
2755
+ "step": 12600
2756
+ },
2757
+ {
2758
+ "epoch": 3.0533518415136474,
2759
+ "grad_norm": 0.279296875,
2760
+ "learning_rate": 1.4780443639655952e-05,
2761
+ "loss": 2.0858,
2762
+ "mean_token_accuracy": 0.5563259933143854,
2763
+ "num_input_tokens_seen": 6111161617,
2764
+ "num_tokens": 2575504513.0,
2765
+ "step": 12650
2766
+ },
2767
+ {
2768
+ "epoch": 3.0654223938923004,
2769
+ "grad_norm": 0.25,
2770
+ "learning_rate": 1.4591821336954884e-05,
2771
+ "loss": 2.101,
2772
+ "mean_token_accuracy": 0.5549140437319875,
2773
+ "num_input_tokens_seen": 6135170369,
2774
+ "num_tokens": 2585570498.0,
2775
+ "step": 12700
2776
+ },
2777
+ {
2778
+ "epoch": 3.077492946270954,
2779
+ "grad_norm": 0.263671875,
2780
+ "learning_rate": 1.4403199034253811e-05,
2781
+ "loss": 2.0935,
2782
+ "mean_token_accuracy": 0.5543564364686608,
2783
+ "num_input_tokens_seen": 6159397985,
2784
+ "num_tokens": 2595740107.0,
2785
+ "step": 12750
2786
+ },
2787
+ {
2788
+ "epoch": 3.089563498649607,
2789
+ "grad_norm": 0.265625,
2790
+ "learning_rate": 1.421457673155274e-05,
2791
+ "loss": 2.0928,
2792
+ "mean_token_accuracy": 0.5548016136884689,
2793
+ "num_input_tokens_seen": 6183511137,
2794
+ "num_tokens": 2605900153.0,
2795
+ "step": 12800
2796
+ },
2797
+ {
2798
+ "epoch": 3.10163405102826,
2799
+ "grad_norm": 0.2890625,
2800
+ "learning_rate": 1.4025954428851668e-05,
2801
+ "loss": 2.0862,
2802
+ "mean_token_accuracy": 0.5555924268066883,
2803
+ "num_input_tokens_seen": 6207630993,
2804
+ "num_tokens": 2616105592.0,
2805
+ "step": 12850
2806
+ },
2807
+ {
2808
+ "epoch": 3.1137046034069136,
2809
+ "grad_norm": 0.248046875,
2810
+ "learning_rate": 1.3837332126150595e-05,
2811
+ "loss": 2.0938,
2812
+ "mean_token_accuracy": 0.554584386125207,
2813
+ "num_input_tokens_seen": 6231763217,
2814
+ "num_tokens": 2626268256.0,
2815
+ "step": 12900
2816
+ },
2817
+ {
2818
+ "epoch": 3.1257751557855666,
2819
+ "grad_norm": 0.251953125,
2820
+ "learning_rate": 1.3648709823449527e-05,
2821
+ "loss": 2.1042,
2822
+ "mean_token_accuracy": 0.553115917481482,
2823
+ "num_input_tokens_seen": 6255995041,
2824
+ "num_tokens": 2636461653.0,
2825
+ "step": 12950
2826
+ },
2827
+ {
2828
+ "epoch": 3.1378457081642197,
2829
+ "grad_norm": 0.25390625,
2830
+ "learning_rate": 1.3460087520748454e-05,
2831
+ "loss": 2.0952,
2832
+ "num_input_tokens_seen": 6280158129,
2833
+ "step": 13000
2834
+ },
2835
+ {
2836
+ "epoch": 3.1378457081642197,
2837
+ "eval_loss": 1.9681209325790405,
2838
+ "eval_mean_token_accuracy": 0.5785721040555485,
2839
+ "eval_num_tokens": 2646712354.0,
2840
+ "eval_runtime": 130.3881,
2841
+ "eval_samples_per_second": 82.155,
2842
+ "eval_steps_per_second": 20.539,
2843
+ "num_input_tokens_seen": 6280158129,
2844
+ "step": 13000
2845
  }
2846
  ],
2847
  "logging_steps": 50,
2848
  "max_steps": 16568,
2849
+ "num_input_tokens_seen": 6280158129,
2850
  "num_train_epochs": 4,
2851
  "save_steps": 1000,
2852
  "stateful_callbacks": {
 
2861
  "attributes": {}
2862
  }
2863
  },
2864
+ "total_flos": 1.680003593850839e+18,
2865
  "train_batch_size": 16,
2866
  "trial_name": null,
2867
  "trial_params": null