aghatage commited on
Commit
643faf0
·
verified ·
1 Parent(s): 00f9c7e

Training in progress, step 7500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e18628340448d6cc6532411e4697ff75d543f26260e6f7133ea8b56b72f7242
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e218a5bbad6f972caad0894b0b220511e0cb0cb3787c44a85e875a3ce67f3813
3
  size 12017472
last-checkpoint/global_step7500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f0c6a776f557f95f4a5de2e5594e410d029df9d00a110f71924423c987a5e8b
3
+ size 71982309
last-checkpoint/global_step7500/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc6aedf984eb49a766b5397998d1ccdee863f9d0b635a66e9096c9fb5555965a
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step7000
 
1
+ global_step7500
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26376c5c3b634bbc75ff000a6f0bd179c575c9ddbba230c22db308946450acc0
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5c58bb9b510ddfd192f4d2021c0156080905e1ad4e17052f1d4a70bda5c74ec
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 7000,
3
- "best_metric": 0.5689130425453186,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-7000",
5
- "epoch": 5.0872568623886565,
6
  "eval_steps": 250,
7
- "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2817,6 +2817,206 @@
2817
  "eval_samples_per_second": 43.506,
2818
  "eval_steps_per_second": 5.445,
2819
  "step": 7000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2820
  }
2821
  ],
2822
  "logging_steps": 25,
@@ -2836,7 +3036,7 @@
2836
  "attributes": {}
2837
  }
2838
  },
2839
- "total_flos": 3.888375314430034e+17,
2840
  "train_batch_size": 4,
2841
  "trial_name": null,
2842
  "trial_params": null
 
1
  {
2
+ "best_global_step": 7500,
3
+ "best_metric": 0.5647426843643188,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-7500",
5
+ "epoch": 5.450827122341392,
6
  "eval_steps": 250,
7
+ "global_step": 7500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2817
  "eval_samples_per_second": 43.506,
2818
  "eval_steps_per_second": 5.445,
2819
  "step": 7000
2820
+ },
2821
+ {
2822
+ "epoch": 5.1054353753862936,
2823
+ "grad_norm": 0.7935928702354431,
2824
+ "learning_rate": 4.971038696306446e-05,
2825
+ "loss": 0.5501,
2826
+ "mean_token_accuracy": 0.8285538706183434,
2827
+ "num_tokens": 154698091.0,
2828
+ "step": 7025
2829
+ },
2830
+ {
2831
+ "epoch": 5.123613888383931,
2832
+ "grad_norm": 0.7594472169876099,
2833
+ "learning_rate": 4.952458208239385e-05,
2834
+ "loss": 0.5487,
2835
+ "mean_token_accuracy": 0.8303073984384537,
2836
+ "num_tokens": 155238389.0,
2837
+ "step": 7050
2838
+ },
2839
+ {
2840
+ "epoch": 5.141792401381567,
2841
+ "grad_norm": 0.7793622016906738,
2842
+ "learning_rate": 4.933855907853041e-05,
2843
+ "loss": 0.5526,
2844
+ "mean_token_accuracy": 0.828688297867775,
2845
+ "num_tokens": 155796109.0,
2846
+ "step": 7075
2847
+ },
2848
+ {
2849
+ "epoch": 5.159970914379204,
2850
+ "grad_norm": 0.7963124513626099,
2851
+ "learning_rate": 4.9152322211601326e-05,
2852
+ "loss": 0.5617,
2853
+ "mean_token_accuracy": 0.8250745138525963,
2854
+ "num_tokens": 156367817.0,
2855
+ "step": 7100
2856
+ },
2857
+ {
2858
+ "epoch": 5.178149427376841,
2859
+ "grad_norm": 0.815303385257721,
2860
+ "learning_rate": 4.8965875746631553e-05,
2861
+ "loss": 0.5527,
2862
+ "mean_token_accuracy": 0.8272364658117294,
2863
+ "num_tokens": 156937564.0,
2864
+ "step": 7125
2865
+ },
2866
+ {
2867
+ "epoch": 5.196327940374477,
2868
+ "grad_norm": 0.7769586443901062,
2869
+ "learning_rate": 4.8779223953446054e-05,
2870
+ "loss": 0.5539,
2871
+ "mean_token_accuracy": 0.8281649795174598,
2872
+ "num_tokens": 157487986.0,
2873
+ "step": 7150
2874
+ },
2875
+ {
2876
+ "epoch": 5.214506453372114,
2877
+ "grad_norm": 0.7640786170959473,
2878
+ "learning_rate": 4.8592371106571984e-05,
2879
+ "loss": 0.5553,
2880
+ "mean_token_accuracy": 0.8278635969758034,
2881
+ "num_tokens": 158049502.0,
2882
+ "step": 7175
2883
+ },
2884
+ {
2885
+ "epoch": 5.232684966369751,
2886
+ "grad_norm": 0.7943294644355774,
2887
+ "learning_rate": 4.8405321485140926e-05,
2888
+ "loss": 0.5515,
2889
+ "mean_token_accuracy": 0.8292607891559601,
2890
+ "num_tokens": 158573810.0,
2891
+ "step": 7200
2892
+ },
2893
+ {
2894
+ "epoch": 5.250863479367387,
2895
+ "grad_norm": 0.8353666067123413,
2896
+ "learning_rate": 4.821807937279074e-05,
2897
+ "loss": 0.5493,
2898
+ "mean_token_accuracy": 0.8291581255197525,
2899
+ "num_tokens": 159126021.0,
2900
+ "step": 7225
2901
+ },
2902
+ {
2903
+ "epoch": 5.2690419923650245,
2904
+ "grad_norm": 0.8323714137077332,
2905
+ "learning_rate": 4.8030649057567545e-05,
2906
+ "loss": 0.5574,
2907
+ "mean_token_accuracy": 0.8271696311235428,
2908
+ "num_tokens": 159687774.0,
2909
+ "step": 7250
2910
+ },
2911
+ {
2912
+ "epoch": 5.2690419923650245,
2913
+ "eval_loss": 0.5662592053413391,
2914
+ "eval_mean_token_accuracy": 0.8236163130967445,
2915
+ "eval_num_tokens": 159687774.0,
2916
+ "eval_runtime": 112.894,
2917
+ "eval_samples_per_second": 43.315,
2918
+ "eval_steps_per_second": 5.421,
2919
+ "step": 7250
2920
+ },
2921
+ {
2922
+ "epoch": 5.2872205053626615,
2923
+ "grad_norm": 0.7205966114997864,
2924
+ "learning_rate": 4.784303483182755e-05,
2925
+ "loss": 0.553,
2926
+ "mean_token_accuracy": 0.8278142037987709,
2927
+ "num_tokens": 160241228.0,
2928
+ "step": 7275
2929
+ },
2930
+ {
2931
+ "epoch": 5.305399018360298,
2932
+ "grad_norm": 0.8180447816848755,
2933
+ "learning_rate": 4.7655240992138677e-05,
2934
+ "loss": 0.5491,
2935
+ "mean_token_accuracy": 0.829489229619503,
2936
+ "num_tokens": 160767430.0,
2937
+ "step": 7300
2938
+ },
2939
+ {
2940
+ "epoch": 5.323577531357935,
2941
+ "grad_norm": 0.7637699842453003,
2942
+ "learning_rate": 4.746727183918221e-05,
2943
+ "loss": 0.5595,
2944
+ "mean_token_accuracy": 0.8261320424079895,
2945
+ "num_tokens": 161318820.0,
2946
+ "step": 7325
2947
+ },
2948
+ {
2949
+ "epoch": 5.341756044355572,
2950
+ "grad_norm": 0.7907775640487671,
2951
+ "learning_rate": 4.727913167765431e-05,
2952
+ "loss": 0.5525,
2953
+ "mean_token_accuracy": 0.8275396654009819,
2954
+ "num_tokens": 161877946.0,
2955
+ "step": 7350
2956
+ },
2957
+ {
2958
+ "epoch": 5.359934557353209,
2959
+ "grad_norm": 0.7488855719566345,
2960
+ "learning_rate": 4.7090824816167384e-05,
2961
+ "loss": 0.5516,
2962
+ "mean_token_accuracy": 0.8294497436285019,
2963
+ "num_tokens": 162425201.0,
2964
+ "step": 7375
2965
+ },
2966
+ {
2967
+ "epoch": 5.378113070350845,
2968
+ "grad_norm": 0.8791596293449402,
2969
+ "learning_rate": 4.6902355567151486e-05,
2970
+ "loss": 0.5533,
2971
+ "mean_token_accuracy": 0.8279849541187286,
2972
+ "num_tokens": 162980062.0,
2973
+ "step": 7400
2974
+ },
2975
+ {
2976
+ "epoch": 5.396291583348482,
2977
+ "grad_norm": 0.8288064002990723,
2978
+ "learning_rate": 4.671372824675549e-05,
2979
+ "loss": 0.5463,
2980
+ "mean_token_accuracy": 0.8298707720637322,
2981
+ "num_tokens": 163536063.0,
2982
+ "step": 7425
2983
+ },
2984
+ {
2985
+ "epoch": 5.414470096346119,
2986
+ "grad_norm": 0.8334540724754333,
2987
+ "learning_rate": 4.65249471747483e-05,
2988
+ "loss": 0.5488,
2989
+ "mean_token_accuracy": 0.8298037537932396,
2990
+ "num_tokens": 164096275.0,
2991
+ "step": 7450
2992
+ },
2993
+ {
2994
+ "epoch": 5.432648609343755,
2995
+ "grad_norm": 0.7712641358375549,
2996
+ "learning_rate": 4.6336016674419886e-05,
2997
+ "loss": 0.5423,
2998
+ "mean_token_accuracy": 0.8307902818918228,
2999
+ "num_tokens": 164633060.0,
3000
+ "step": 7475
3001
+ },
3002
+ {
3003
+ "epoch": 5.450827122341392,
3004
+ "grad_norm": 0.7974975109100342,
3005
+ "learning_rate": 4.614694107248228e-05,
3006
+ "loss": 0.5527,
3007
+ "mean_token_accuracy": 0.8289200633764267,
3008
+ "num_tokens": 165169760.0,
3009
+ "step": 7500
3010
+ },
3011
+ {
3012
+ "epoch": 5.450827122341392,
3013
+ "eval_loss": 0.5647426843643188,
3014
+ "eval_mean_token_accuracy": 0.82398138930595,
3015
+ "eval_num_tokens": 165169760.0,
3016
+ "eval_runtime": 113.2742,
3017
+ "eval_samples_per_second": 43.17,
3018
+ "eval_steps_per_second": 5.403,
3019
+ "step": 7500
3020
  }
3021
  ],
3022
  "logging_steps": 25,
 
3036
  "attributes": {}
3037
  }
3038
  },
3039
+ "total_flos": 4.16634638434304e+17,
3040
  "train_batch_size": 4,
3041
  "trial_name": null,
3042
  "trial_params": null