kiritan commited on
Commit
f8bee8e
·
verified ·
1 Parent(s): 42a06f2

Training in progress, step 11000, checkpoint

Browse files
last-checkpoint/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8e043ccfd2d14162108046118260c4a11838198a3378b8c63aef14e884f315e
3
+ size 5117197489
last-checkpoint/global_step11000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45f6d58392e57f60153009c206c846732ef428fe79bd9f765140b63722b1c39e
3
+ size 859127933
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step9000
 
1
+ global_step11000
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aef828c688fc4b40c4f970b4f1621324009e1b6de86d3a3ed65007b337b7f7e7
3
  size 962205216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55de6622ea2c12f2865659952fae3e7645ab102a38297690cb4fdbaeb6a9d78f
3
  size 962205216
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:64e572f1314b9da8f922a0fbf0c91986e4b7b809f9a1dbb178f491f4b7541f4c
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a42b77849766d934d44019f3aaacdcb7addb89613853b8085a0f3dbdc6ec32df
3
  size 14709
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec7cb829bad4c5e40215f974eb8875988bba1a68c4193a01021b2b11b0d8359f
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:493d0f530ff7fc5bb7b7e09a1475f8ed1e6010e09c7b8eee02f261c6c00502eb
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 84.13012729844414,
3
- "best_model_checkpoint": "./iteboshi_student_model_temp/checkpoint-7000",
4
- "epoch": 9.911894273127754,
5
  "eval_steps": 1000,
6
- "global_step": 9000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2617,6 +2617,586 @@
2617
  "eval_steps_per_second": 2.033,
2618
  "eval_wer": 84.72418670438473,
2619
  "step": 9000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2620
  }
2621
  ],
2622
  "logging_steps": 25,
@@ -2636,7 +3216,7 @@
2636
  "attributes": {}
2637
  }
2638
  },
2639
- "total_flos": 1.546328867841088e+20,
2640
  "train_batch_size": 4,
2641
  "trial_name": null,
2642
  "trial_params": null
 
1
  {
2
+ "best_metric": 83.86610089580387,
3
+ "best_model_checkpoint": "./iteboshi_student_model_temp/checkpoint-11000",
4
+ "epoch": 12.114537444933921,
5
  "eval_steps": 1000,
6
+ "global_step": 11000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2617
  "eval_steps_per_second": 2.033,
2618
  "eval_wer": 84.72418670438473,
2619
  "step": 9000
2620
+ },
2621
+ {
2622
+ "epoch": 9.939427312775331,
2623
+ "grad_norm": 0.9093891382217407,
2624
+ "learning_rate": 1.1256410256410258e-05,
2625
+ "loss": 0.0354,
2626
+ "step": 9025
2627
+ },
2628
+ {
2629
+ "epoch": 9.966960352422907,
2630
+ "grad_norm": 0.526305615901947,
2631
+ "learning_rate": 1.1230769230769232e-05,
2632
+ "loss": 0.04,
2633
+ "step": 9050
2634
+ },
2635
+ {
2636
+ "epoch": 9.994493392070485,
2637
+ "grad_norm": 0.4748174846172333,
2638
+ "learning_rate": 1.1205128205128205e-05,
2639
+ "loss": 0.0405,
2640
+ "step": 9075
2641
+ },
2642
+ {
2643
+ "epoch": 10.022026431718063,
2644
+ "grad_norm": 0.23602962493896484,
2645
+ "learning_rate": 1.117948717948718e-05,
2646
+ "loss": 0.0245,
2647
+ "step": 9100
2648
+ },
2649
+ {
2650
+ "epoch": 10.049559471365638,
2651
+ "grad_norm": 0.2989708185195923,
2652
+ "learning_rate": 1.1153846153846154e-05,
2653
+ "loss": 0.0231,
2654
+ "step": 9125
2655
+ },
2656
+ {
2657
+ "epoch": 10.077092511013216,
2658
+ "grad_norm": 0.34653839468955994,
2659
+ "learning_rate": 1.112820512820513e-05,
2660
+ "loss": 0.0306,
2661
+ "step": 9150
2662
+ },
2663
+ {
2664
+ "epoch": 10.104625550660794,
2665
+ "grad_norm": 0.4413544535636902,
2666
+ "learning_rate": 1.1102564102564103e-05,
2667
+ "loss": 0.0242,
2668
+ "step": 9175
2669
+ },
2670
+ {
2671
+ "epoch": 10.13215859030837,
2672
+ "grad_norm": 0.44882041215896606,
2673
+ "learning_rate": 1.1076923076923079e-05,
2674
+ "loss": 0.036,
2675
+ "step": 9200
2676
+ },
2677
+ {
2678
+ "epoch": 10.159691629955947,
2679
+ "grad_norm": 0.049951497465372086,
2680
+ "learning_rate": 1.1051282051282052e-05,
2681
+ "loss": 0.0249,
2682
+ "step": 9225
2683
+ },
2684
+ {
2685
+ "epoch": 10.187224669603523,
2686
+ "grad_norm": 0.34928587079048157,
2687
+ "learning_rate": 1.1025641025641028e-05,
2688
+ "loss": 0.0322,
2689
+ "step": 9250
2690
+ },
2691
+ {
2692
+ "epoch": 10.214757709251101,
2693
+ "grad_norm": 0.18765118718147278,
2694
+ "learning_rate": 1.1000000000000001e-05,
2695
+ "loss": 0.0249,
2696
+ "step": 9275
2697
+ },
2698
+ {
2699
+ "epoch": 10.242290748898679,
2700
+ "grad_norm": 0.09570558369159698,
2701
+ "learning_rate": 1.0974358974358977e-05,
2702
+ "loss": 0.0241,
2703
+ "step": 9300
2704
+ },
2705
+ {
2706
+ "epoch": 10.269823788546255,
2707
+ "grad_norm": 0.36708030104637146,
2708
+ "learning_rate": 1.094871794871795e-05,
2709
+ "loss": 0.0267,
2710
+ "step": 9325
2711
+ },
2712
+ {
2713
+ "epoch": 10.297356828193832,
2714
+ "grad_norm": 0.6306156516075134,
2715
+ "learning_rate": 1.0923076923076922e-05,
2716
+ "loss": 0.028,
2717
+ "step": 9350
2718
+ },
2719
+ {
2720
+ "epoch": 10.32488986784141,
2721
+ "grad_norm": 0.47958239912986755,
2722
+ "learning_rate": 1.0897435897435898e-05,
2723
+ "loss": 0.0374,
2724
+ "step": 9375
2725
+ },
2726
+ {
2727
+ "epoch": 10.352422907488986,
2728
+ "grad_norm": 0.5049773454666138,
2729
+ "learning_rate": 1.0871794871794871e-05,
2730
+ "loss": 0.0252,
2731
+ "step": 9400
2732
+ },
2733
+ {
2734
+ "epoch": 10.379955947136564,
2735
+ "grad_norm": 0.18035492300987244,
2736
+ "learning_rate": 1.0846153846153847e-05,
2737
+ "loss": 0.032,
2738
+ "step": 9425
2739
+ },
2740
+ {
2741
+ "epoch": 10.407488986784141,
2742
+ "grad_norm": 0.40862882137298584,
2743
+ "learning_rate": 1.082051282051282e-05,
2744
+ "loss": 0.0317,
2745
+ "step": 9450
2746
+ },
2747
+ {
2748
+ "epoch": 10.435022026431717,
2749
+ "grad_norm": 0.4345795512199402,
2750
+ "learning_rate": 1.0794871794871796e-05,
2751
+ "loss": 0.0227,
2752
+ "step": 9475
2753
+ },
2754
+ {
2755
+ "epoch": 10.462555066079295,
2756
+ "grad_norm": 0.32652077078819275,
2757
+ "learning_rate": 1.076923076923077e-05,
2758
+ "loss": 0.0274,
2759
+ "step": 9500
2760
+ },
2761
+ {
2762
+ "epoch": 10.490088105726873,
2763
+ "grad_norm": 0.49059435725212097,
2764
+ "learning_rate": 1.0743589743589745e-05,
2765
+ "loss": 0.0336,
2766
+ "step": 9525
2767
+ },
2768
+ {
2769
+ "epoch": 10.517621145374449,
2770
+ "grad_norm": 0.14571261405944824,
2771
+ "learning_rate": 1.0717948717948718e-05,
2772
+ "loss": 0.0244,
2773
+ "step": 9550
2774
+ },
2775
+ {
2776
+ "epoch": 10.545154185022026,
2777
+ "grad_norm": 0.2149128019809723,
2778
+ "learning_rate": 1.0692307692307694e-05,
2779
+ "loss": 0.0252,
2780
+ "step": 9575
2781
+ },
2782
+ {
2783
+ "epoch": 10.572687224669604,
2784
+ "grad_norm": 0.20995257794857025,
2785
+ "learning_rate": 1.0666666666666667e-05,
2786
+ "loss": 0.0311,
2787
+ "step": 9600
2788
+ },
2789
+ {
2790
+ "epoch": 10.60022026431718,
2791
+ "grad_norm": 0.4227479100227356,
2792
+ "learning_rate": 1.0641025641025643e-05,
2793
+ "loss": 0.0261,
2794
+ "step": 9625
2795
+ },
2796
+ {
2797
+ "epoch": 10.627753303964758,
2798
+ "grad_norm": 0.1345728039741516,
2799
+ "learning_rate": 1.0615384615384616e-05,
2800
+ "loss": 0.026,
2801
+ "step": 9650
2802
+ },
2803
+ {
2804
+ "epoch": 10.655286343612335,
2805
+ "grad_norm": 0.5568249821662903,
2806
+ "learning_rate": 1.058974358974359e-05,
2807
+ "loss": 0.0275,
2808
+ "step": 9675
2809
+ },
2810
+ {
2811
+ "epoch": 10.682819383259911,
2812
+ "grad_norm": 0.5649207234382629,
2813
+ "learning_rate": 1.0564102564102565e-05,
2814
+ "loss": 0.03,
2815
+ "step": 9700
2816
+ },
2817
+ {
2818
+ "epoch": 10.710352422907489,
2819
+ "grad_norm": 0.23224163055419922,
2820
+ "learning_rate": 1.0538461538461539e-05,
2821
+ "loss": 0.0292,
2822
+ "step": 9725
2823
+ },
2824
+ {
2825
+ "epoch": 10.737885462555067,
2826
+ "grad_norm": 0.2227552831172943,
2827
+ "learning_rate": 1.0512820512820514e-05,
2828
+ "loss": 0.028,
2829
+ "step": 9750
2830
+ },
2831
+ {
2832
+ "epoch": 10.765418502202643,
2833
+ "grad_norm": 0.07342702895402908,
2834
+ "learning_rate": 1.0487179487179488e-05,
2835
+ "loss": 0.0227,
2836
+ "step": 9775
2837
+ },
2838
+ {
2839
+ "epoch": 10.79295154185022,
2840
+ "grad_norm": 0.3385262191295624,
2841
+ "learning_rate": 1.0461538461538463e-05,
2842
+ "loss": 0.0325,
2843
+ "step": 9800
2844
+ },
2845
+ {
2846
+ "epoch": 10.820484581497798,
2847
+ "grad_norm": 0.2666647434234619,
2848
+ "learning_rate": 1.0435897435897437e-05,
2849
+ "loss": 0.0264,
2850
+ "step": 9825
2851
+ },
2852
+ {
2853
+ "epoch": 10.848017621145374,
2854
+ "grad_norm": 0.13147205114364624,
2855
+ "learning_rate": 1.0410256410256412e-05,
2856
+ "loss": 0.0184,
2857
+ "step": 9850
2858
+ },
2859
+ {
2860
+ "epoch": 10.875550660792952,
2861
+ "grad_norm": 0.24823608994483948,
2862
+ "learning_rate": 1.0384615384615386e-05,
2863
+ "loss": 0.0249,
2864
+ "step": 9875
2865
+ },
2866
+ {
2867
+ "epoch": 10.90308370044053,
2868
+ "grad_norm": 0.265788197517395,
2869
+ "learning_rate": 1.0358974358974361e-05,
2870
+ "loss": 0.0217,
2871
+ "step": 9900
2872
+ },
2873
+ {
2874
+ "epoch": 10.930616740088105,
2875
+ "grad_norm": 0.2914508879184723,
2876
+ "learning_rate": 1.0333333333333335e-05,
2877
+ "loss": 0.0199,
2878
+ "step": 9925
2879
+ },
2880
+ {
2881
+ "epoch": 10.958149779735683,
2882
+ "grad_norm": 0.19100092351436615,
2883
+ "learning_rate": 1.0307692307692307e-05,
2884
+ "loss": 0.0232,
2885
+ "step": 9950
2886
+ },
2887
+ {
2888
+ "epoch": 10.98568281938326,
2889
+ "grad_norm": 0.2141091227531433,
2890
+ "learning_rate": 1.0282051282051282e-05,
2891
+ "loss": 0.0276,
2892
+ "step": 9975
2893
+ },
2894
+ {
2895
+ "epoch": 11.013215859030836,
2896
+ "grad_norm": 0.09335622936487198,
2897
+ "learning_rate": 1.0256410256410256e-05,
2898
+ "loss": 0.0186,
2899
+ "step": 10000
2900
+ },
2901
+ {
2902
+ "epoch": 11.013215859030836,
2903
+ "eval_cer": 25.171093508190705,
2904
+ "eval_loss": 0.8366118669509888,
2905
+ "eval_runtime": 1307.8053,
2906
+ "eval_samples_per_second": 8.091,
2907
+ "eval_steps_per_second": 2.023,
2908
+ "eval_wer": 84.47901933050449,
2909
+ "step": 10000
2910
+ },
2911
+ {
2912
+ "epoch": 11.040748898678414,
2913
+ "grad_norm": 0.29987862706184387,
2914
+ "learning_rate": 1.0230769230769231e-05,
2915
+ "loss": 0.0117,
2916
+ "step": 10025
2917
+ },
2918
+ {
2919
+ "epoch": 11.068281938325992,
2920
+ "grad_norm": 0.22261077165603638,
2921
+ "learning_rate": 1.0205128205128205e-05,
2922
+ "loss": 0.0199,
2923
+ "step": 10050
2924
+ },
2925
+ {
2926
+ "epoch": 11.095814977973568,
2927
+ "grad_norm": 0.7212164402008057,
2928
+ "learning_rate": 1.017948717948718e-05,
2929
+ "loss": 0.0194,
2930
+ "step": 10075
2931
+ },
2932
+ {
2933
+ "epoch": 11.123348017621145,
2934
+ "grad_norm": 0.18654099106788635,
2935
+ "learning_rate": 1.0153846153846154e-05,
2936
+ "loss": 0.0191,
2937
+ "step": 10100
2938
+ },
2939
+ {
2940
+ "epoch": 11.150881057268723,
2941
+ "grad_norm": 0.1351199895143509,
2942
+ "learning_rate": 1.012820512820513e-05,
2943
+ "loss": 0.0151,
2944
+ "step": 10125
2945
+ },
2946
+ {
2947
+ "epoch": 11.178414096916299,
2948
+ "grad_norm": 0.24383758008480072,
2949
+ "learning_rate": 1.0102564102564103e-05,
2950
+ "loss": 0.0142,
2951
+ "step": 10150
2952
+ },
2953
+ {
2954
+ "epoch": 11.205947136563877,
2955
+ "grad_norm": 0.1962803304195404,
2956
+ "learning_rate": 1.0076923076923078e-05,
2957
+ "loss": 0.0159,
2958
+ "step": 10175
2959
+ },
2960
+ {
2961
+ "epoch": 11.233480176211454,
2962
+ "grad_norm": 0.1277613639831543,
2963
+ "learning_rate": 1.0051282051282052e-05,
2964
+ "loss": 0.018,
2965
+ "step": 10200
2966
+ },
2967
+ {
2968
+ "epoch": 11.26101321585903,
2969
+ "grad_norm": 0.17365778982639313,
2970
+ "learning_rate": 1.0025641025641027e-05,
2971
+ "loss": 0.0198,
2972
+ "step": 10225
2973
+ },
2974
+ {
2975
+ "epoch": 11.288546255506608,
2976
+ "grad_norm": 0.5494518876075745,
2977
+ "learning_rate": 1e-05,
2978
+ "loss": 0.0157,
2979
+ "step": 10250
2980
+ },
2981
+ {
2982
+ "epoch": 11.316079295154186,
2983
+ "grad_norm": 0.11686886101961136,
2984
+ "learning_rate": 9.974358974358974e-06,
2985
+ "loss": 0.024,
2986
+ "step": 10275
2987
+ },
2988
+ {
2989
+ "epoch": 11.343612334801762,
2990
+ "grad_norm": 0.15467554330825806,
2991
+ "learning_rate": 9.94871794871795e-06,
2992
+ "loss": 0.0174,
2993
+ "step": 10300
2994
+ },
2995
+ {
2996
+ "epoch": 11.37114537444934,
2997
+ "grad_norm": 0.10721301287412643,
2998
+ "learning_rate": 9.923076923076923e-06,
2999
+ "loss": 0.0169,
3000
+ "step": 10325
3001
+ },
3002
+ {
3003
+ "epoch": 11.398678414096917,
3004
+ "grad_norm": 0.1287498027086258,
3005
+ "learning_rate": 9.897435897435899e-06,
3006
+ "loss": 0.0202,
3007
+ "step": 10350
3008
+ },
3009
+ {
3010
+ "epoch": 11.426211453744493,
3011
+ "grad_norm": 0.4366730749607086,
3012
+ "learning_rate": 9.871794871794872e-06,
3013
+ "loss": 0.0166,
3014
+ "step": 10375
3015
+ },
3016
+ {
3017
+ "epoch": 11.45374449339207,
3018
+ "grad_norm": 0.12972579896450043,
3019
+ "learning_rate": 9.846153846153848e-06,
3020
+ "loss": 0.0177,
3021
+ "step": 10400
3022
+ },
3023
+ {
3024
+ "epoch": 11.481277533039648,
3025
+ "grad_norm": 0.810859203338623,
3026
+ "learning_rate": 9.820512820512821e-06,
3027
+ "loss": 0.0173,
3028
+ "step": 10425
3029
+ },
3030
+ {
3031
+ "epoch": 11.508810572687224,
3032
+ "grad_norm": 0.1165216714143753,
3033
+ "learning_rate": 9.794871794871795e-06,
3034
+ "loss": 0.0194,
3035
+ "step": 10450
3036
+ },
3037
+ {
3038
+ "epoch": 11.536343612334802,
3039
+ "grad_norm": 0.16423256695270538,
3040
+ "learning_rate": 9.76923076923077e-06,
3041
+ "loss": 0.017,
3042
+ "step": 10475
3043
+ },
3044
+ {
3045
+ "epoch": 11.56387665198238,
3046
+ "grad_norm": 0.6200090050697327,
3047
+ "learning_rate": 9.743589743589744e-06,
3048
+ "loss": 0.0233,
3049
+ "step": 10500
3050
+ },
3051
+ {
3052
+ "epoch": 11.591409691629956,
3053
+ "grad_norm": 0.3650573790073395,
3054
+ "learning_rate": 9.71794871794872e-06,
3055
+ "loss": 0.0188,
3056
+ "step": 10525
3057
+ },
3058
+ {
3059
+ "epoch": 11.618942731277533,
3060
+ "grad_norm": 0.23086689412593842,
3061
+ "learning_rate": 9.692307692307693e-06,
3062
+ "loss": 0.0166,
3063
+ "step": 10550
3064
+ },
3065
+ {
3066
+ "epoch": 11.646475770925111,
3067
+ "grad_norm": 0.28406432271003723,
3068
+ "learning_rate": 9.666666666666667e-06,
3069
+ "loss": 0.0199,
3070
+ "step": 10575
3071
+ },
3072
+ {
3073
+ "epoch": 11.674008810572687,
3074
+ "grad_norm": 0.13203246891498566,
3075
+ "learning_rate": 9.641025641025642e-06,
3076
+ "loss": 0.0169,
3077
+ "step": 10600
3078
+ },
3079
+ {
3080
+ "epoch": 11.701541850220265,
3081
+ "grad_norm": 0.3809435963630676,
3082
+ "learning_rate": 9.615384615384616e-06,
3083
+ "loss": 0.0167,
3084
+ "step": 10625
3085
+ },
3086
+ {
3087
+ "epoch": 11.729074889867842,
3088
+ "grad_norm": 0.2622781991958618,
3089
+ "learning_rate": 9.589743589743591e-06,
3090
+ "loss": 0.023,
3091
+ "step": 10650
3092
+ },
3093
+ {
3094
+ "epoch": 11.756607929515418,
3095
+ "grad_norm": 0.3118574321269989,
3096
+ "learning_rate": 9.564102564102565e-06,
3097
+ "loss": 0.0162,
3098
+ "step": 10675
3099
+ },
3100
+ {
3101
+ "epoch": 11.784140969162996,
3102
+ "grad_norm": 0.29195636510849,
3103
+ "learning_rate": 9.53846153846154e-06,
3104
+ "loss": 0.0166,
3105
+ "step": 10700
3106
+ },
3107
+ {
3108
+ "epoch": 11.811674008810574,
3109
+ "grad_norm": 0.16257286071777344,
3110
+ "learning_rate": 9.512820512820514e-06,
3111
+ "loss": 0.0186,
3112
+ "step": 10725
3113
+ },
3114
+ {
3115
+ "epoch": 11.83920704845815,
3116
+ "grad_norm": 0.2690454125404358,
3117
+ "learning_rate": 9.487179487179487e-06,
3118
+ "loss": 0.0184,
3119
+ "step": 10750
3120
+ },
3121
+ {
3122
+ "epoch": 11.866740088105727,
3123
+ "grad_norm": 0.07074102014303207,
3124
+ "learning_rate": 9.461538461538463e-06,
3125
+ "loss": 0.0147,
3126
+ "step": 10775
3127
+ },
3128
+ {
3129
+ "epoch": 11.894273127753303,
3130
+ "grad_norm": 0.0660664364695549,
3131
+ "learning_rate": 9.435897435897436e-06,
3132
+ "loss": 0.017,
3133
+ "step": 10800
3134
+ },
3135
+ {
3136
+ "epoch": 11.92180616740088,
3137
+ "grad_norm": 0.42482617497444153,
3138
+ "learning_rate": 9.410256410256412e-06,
3139
+ "loss": 0.0164,
3140
+ "step": 10825
3141
+ },
3142
+ {
3143
+ "epoch": 11.949339207048459,
3144
+ "grad_norm": 0.16394160687923431,
3145
+ "learning_rate": 9.384615384615385e-06,
3146
+ "loss": 0.0154,
3147
+ "step": 10850
3148
+ },
3149
+ {
3150
+ "epoch": 11.976872246696034,
3151
+ "grad_norm": 0.39682498574256897,
3152
+ "learning_rate": 9.358974358974359e-06,
3153
+ "loss": 0.0198,
3154
+ "step": 10875
3155
+ },
3156
+ {
3157
+ "epoch": 12.004405286343612,
3158
+ "grad_norm": 0.1381184458732605,
3159
+ "learning_rate": 9.333333333333334e-06,
3160
+ "loss": 0.0193,
3161
+ "step": 10900
3162
+ },
3163
+ {
3164
+ "epoch": 12.03193832599119,
3165
+ "grad_norm": 0.15030303597450256,
3166
+ "learning_rate": 9.307692307692308e-06,
3167
+ "loss": 0.0199,
3168
+ "step": 10925
3169
+ },
3170
+ {
3171
+ "epoch": 12.059471365638766,
3172
+ "grad_norm": 0.5344926714897156,
3173
+ "learning_rate": 9.282051282051283e-06,
3174
+ "loss": 0.0197,
3175
+ "step": 10950
3176
+ },
3177
+ {
3178
+ "epoch": 12.087004405286343,
3179
+ "grad_norm": 0.18761467933654785,
3180
+ "learning_rate": 9.256410256410257e-06,
3181
+ "loss": 0.0166,
3182
+ "step": 10975
3183
+ },
3184
+ {
3185
+ "epoch": 12.114537444933921,
3186
+ "grad_norm": 0.22124651074409485,
3187
+ "learning_rate": 9.230769230769232e-06,
3188
+ "loss": 0.0123,
3189
+ "step": 11000
3190
+ },
3191
+ {
3192
+ "epoch": 12.114537444933921,
3193
+ "eval_cer": 24.39787695023672,
3194
+ "eval_loss": 0.8476730585098267,
3195
+ "eval_runtime": 1307.0774,
3196
+ "eval_samples_per_second": 8.095,
3197
+ "eval_steps_per_second": 2.024,
3198
+ "eval_wer": 83.86610089580387,
3199
+ "step": 11000
3200
  }
3201
  ],
3202
  "logging_steps": 25,
 
3216
  "attributes": {}
3217
  }
3218
  },
3219
+ "total_flos": 1.8899575051391074e+20,
3220
  "train_batch_size": 4,
3221
  "trial_name": null,
3222
  "trial_params": null