moos124 commited on
Commit
1fd5572
·
verified ·
1 Parent(s): bccbff3

Training in progress, step 3120, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81bb0ca4c8d3f0b7df4168a6f129ab6659286d7cc50f721d7d619b912b04441c
3
  size 70430032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6707292d7f654e5124c3e926150bc642c498945f51878660f225650de5246c50
3
  size 70430032
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0da701cf784e178c8d5cc5b6be4781f56ce0027e182866707b0b6d82d08f50d0
3
  size 141058579
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:086d9369ed7b9b1b0db7b13e9ce72ff9f192de08d450f900a44752b156fb06a4
3
  size 141058579
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac8ce587a1f63693d985c5a1ab868e6efb026b5e08677b84eaca40b9a02b9058
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:671dc5d364c5724905180db7a8f088b1689fd04a21018fd65eb0b930b5fd8447
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba1b53ed33622e17fae8a729aa45522b55318b24cb423fed2491721f07b63a63
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5be2b28db77843da54a5469ae9097a28157a8cf17202b01284ef63e0481acf8e
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6058666666666667,
6
  "eval_steps": 500,
7
- "global_step": 2840,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2858,6 +2858,286 @@
2858
  "mean_token_accuracy": 0.7862283095717431,
2859
  "num_tokens": 13204391.0,
2860
  "step": 2840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2861
  }
2862
  ],
2863
  "logging_steps": 10,
@@ -2877,7 +3157,7 @@
2877
  "attributes": {}
2878
  }
2879
  },
2880
- "total_flos": 6.256929604727194e+16,
2881
  "train_batch_size": 4,
2882
  "trial_name": null,
2883
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.6656,
6
  "eval_steps": 500,
7
+ "global_step": 3120,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2858
  "mean_token_accuracy": 0.7862283095717431,
2859
  "num_tokens": 13204391.0,
2860
  "step": 2840
2861
+ },
2862
+ {
2863
+ "entropy": 0.829298897087574,
2864
+ "epoch": 0.608,
2865
+ "grad_norm": 0.2556048631668091,
2866
+ "learning_rate": 8.176813665984053e-05,
2867
+ "loss": 0.8883259773254395,
2868
+ "mean_token_accuracy": 0.789163002371788,
2869
+ "num_tokens": 13244838.0,
2870
+ "step": 2850
2871
+ },
2872
+ {
2873
+ "entropy": 0.9395963847637177,
2874
+ "epoch": 0.6101333333333333,
2875
+ "grad_norm": 0.19703006744384766,
2876
+ "learning_rate": 8.163429845813997e-05,
2877
+ "loss": 1.0494510650634765,
2878
+ "mean_token_accuracy": 0.7710079193115235,
2879
+ "num_tokens": 13290932.0,
2880
+ "step": 2860
2881
+ },
2882
+ {
2883
+ "entropy": 0.9920587949454784,
2884
+ "epoch": 0.6122666666666666,
2885
+ "grad_norm": 0.2381218671798706,
2886
+ "learning_rate": 8.150008123083838e-05,
2887
+ "loss": 1.0494998931884765,
2888
+ "mean_token_accuracy": 0.7526131421327591,
2889
+ "num_tokens": 13333787.0,
2890
+ "step": 2870
2891
+ },
2892
+ {
2893
+ "entropy": 0.9984497465193272,
2894
+ "epoch": 0.6144,
2895
+ "grad_norm": 0.25819751620292664,
2896
+ "learning_rate": 8.136548658605635e-05,
2897
+ "loss": 1.1107137680053711,
2898
+ "mean_token_accuracy": 0.7557663440704345,
2899
+ "num_tokens": 13382126.0,
2900
+ "step": 2880
2901
+ },
2902
+ {
2903
+ "entropy": 0.9907154351472854,
2904
+ "epoch": 0.6165333333333334,
2905
+ "grad_norm": 0.2328466922044754,
2906
+ "learning_rate": 8.123051613643641e-05,
2907
+ "loss": 1.1184075355529786,
2908
+ "mean_token_accuracy": 0.7595549002289772,
2909
+ "num_tokens": 13430083.0,
2910
+ "step": 2890
2911
+ },
2912
+ {
2913
+ "entropy": 0.9244011230766773,
2914
+ "epoch": 0.6186666666666667,
2915
+ "grad_norm": 0.24781359732151031,
2916
+ "learning_rate": 8.109517149912386e-05,
2917
+ "loss": 1.017502498626709,
2918
+ "mean_token_accuracy": 0.7722871780395508,
2919
+ "num_tokens": 13478876.0,
2920
+ "step": 2900
2921
+ },
2922
+ {
2923
+ "entropy": 0.8886970773339271,
2924
+ "epoch": 0.6208,
2925
+ "grad_norm": 0.2412341833114624,
2926
+ "learning_rate": 8.095945429574724e-05,
2927
+ "loss": 0.9119473457336426,
2928
+ "mean_token_accuracy": 0.7751852914690971,
2929
+ "num_tokens": 13527978.0,
2930
+ "step": 2910
2931
+ },
2932
+ {
2933
+ "entropy": 1.040999775379896,
2934
+ "epoch": 0.6229333333333333,
2935
+ "grad_norm": 0.2708323895931244,
2936
+ "learning_rate": 8.082336615239903e-05,
2937
+ "loss": 1.1017963409423828,
2938
+ "mean_token_accuracy": 0.7445731669664383,
2939
+ "num_tokens": 13579308.0,
2940
+ "step": 2920
2941
+ },
2942
+ {
2943
+ "entropy": 1.0086095616221429,
2944
+ "epoch": 0.6250666666666667,
2945
+ "grad_norm": 0.2506955564022064,
2946
+ "learning_rate": 8.068690869961613e-05,
2947
+ "loss": 1.1194355964660645,
2948
+ "mean_token_accuracy": 0.7530581071972847,
2949
+ "num_tokens": 13632480.0,
2950
+ "step": 2930
2951
+ },
2952
+ {
2953
+ "entropy": 0.9920367047190666,
2954
+ "epoch": 0.6272,
2955
+ "grad_norm": 0.28143101930618286,
2956
+ "learning_rate": 8.055008357236027e-05,
2957
+ "loss": 1.0880350112915038,
2958
+ "mean_token_accuracy": 0.7523079156875611,
2959
+ "num_tokens": 13683250.0,
2960
+ "step": 2940
2961
+ },
2962
+ {
2963
+ "entropy": 0.947841040790081,
2964
+ "epoch": 0.6293333333333333,
2965
+ "grad_norm": 0.34841635823249817,
2966
+ "learning_rate": 8.04128924099985e-05,
2967
+ "loss": 1.013569164276123,
2968
+ "mean_token_accuracy": 0.7690569952130317,
2969
+ "num_tokens": 13724761.0,
2970
+ "step": 2950
2971
+ },
2972
+ {
2973
+ "entropy": 0.8923015877604484,
2974
+ "epoch": 0.6314666666666666,
2975
+ "grad_norm": 0.24537858366966248,
2976
+ "learning_rate": 8.027533685628348e-05,
2977
+ "loss": 0.9606434822082519,
2978
+ "mean_token_accuracy": 0.7777309969067574,
2979
+ "num_tokens": 13771701.0,
2980
+ "step": 2960
2981
+ },
2982
+ {
2983
+ "entropy": 1.082998887449503,
2984
+ "epoch": 0.6336,
2985
+ "grad_norm": 0.2772109806537628,
2986
+ "learning_rate": 8.013741855933386e-05,
2987
+ "loss": 1.155489444732666,
2988
+ "mean_token_accuracy": 0.7356668919324875,
2989
+ "num_tokens": 13824969.0,
2990
+ "step": 2970
2991
+ },
2992
+ {
2993
+ "entropy": 1.0548067845404148,
2994
+ "epoch": 0.6357333333333334,
2995
+ "grad_norm": 0.2706131041049957,
2996
+ "learning_rate": 7.999913917161446e-05,
2997
+ "loss": 1.1606884002685547,
2998
+ "mean_token_accuracy": 0.7461161836981773,
2999
+ "num_tokens": 13879673.0,
3000
+ "step": 2980
3001
+ },
3002
+ {
3003
+ "entropy": 0.9122042678296566,
3004
+ "epoch": 0.6378666666666667,
3005
+ "grad_norm": 0.28579071164131165,
3006
+ "learning_rate": 7.986050034991646e-05,
3007
+ "loss": 1.0014433860778809,
3008
+ "mean_token_accuracy": 0.7702639386057853,
3009
+ "num_tokens": 13923893.0,
3010
+ "step": 2990
3011
+ },
3012
+ {
3013
+ "entropy": 0.856528140604496,
3014
+ "epoch": 0.64,
3015
+ "grad_norm": 0.2646186351776123,
3016
+ "learning_rate": 7.972150375533767e-05,
3017
+ "loss": 0.9789193153381348,
3018
+ "mean_token_accuracy": 0.7824795439839363,
3019
+ "num_tokens": 13967914.0,
3020
+ "step": 3000
3021
+ },
3022
+ {
3023
+ "entropy": 1.013469608873129,
3024
+ "epoch": 0.6421333333333333,
3025
+ "grad_norm": 0.2540909945964813,
3026
+ "learning_rate": 7.958215105326252e-05,
3027
+ "loss": 1.1425801277160645,
3028
+ "mean_token_accuracy": 0.7503237001597881,
3029
+ "num_tokens": 14016335.0,
3030
+ "step": 3010
3031
+ },
3032
+ {
3033
+ "entropy": 0.9561307951807976,
3034
+ "epoch": 0.6442666666666667,
3035
+ "grad_norm": 0.2495027333498001,
3036
+ "learning_rate": 7.94424439133421e-05,
3037
+ "loss": 1.0421770095825196,
3038
+ "mean_token_accuracy": 0.7604482308030128,
3039
+ "num_tokens": 14060745.0,
3040
+ "step": 3020
3041
+ },
3042
+ {
3043
+ "entropy": 0.9330584339797496,
3044
+ "epoch": 0.6464,
3045
+ "grad_norm": 0.26480352878570557,
3046
+ "learning_rate": 7.930238400947422e-05,
3047
+ "loss": 1.0622355461120605,
3048
+ "mean_token_accuracy": 0.7683120101690293,
3049
+ "num_tokens": 14108255.0,
3050
+ "step": 3030
3051
+ },
3052
+ {
3053
+ "entropy": 0.8226673573255538,
3054
+ "epoch": 0.6485333333333333,
3055
+ "grad_norm": 0.2883199453353882,
3056
+ "learning_rate": 7.916197301978331e-05,
3057
+ "loss": 0.8736177444458008,
3058
+ "mean_token_accuracy": 0.7835568472743034,
3059
+ "num_tokens": 14151595.0,
3060
+ "step": 3040
3061
+ },
3062
+ {
3063
+ "entropy": 1.0103112280368804,
3064
+ "epoch": 0.6506666666666666,
3065
+ "grad_norm": 0.2573588788509369,
3066
+ "learning_rate": 7.902121262660036e-05,
3067
+ "loss": 1.1782626152038573,
3068
+ "mean_token_accuracy": 0.7547322385013103,
3069
+ "num_tokens": 14198658.0,
3070
+ "step": 3050
3071
+ },
3072
+ {
3073
+ "entropy": 0.9194101721048356,
3074
+ "epoch": 0.6528,
3075
+ "grad_norm": 0.22869926691055298,
3076
+ "learning_rate": 7.888010451644265e-05,
3077
+ "loss": 0.96375732421875,
3078
+ "mean_token_accuracy": 0.7731851547956466,
3079
+ "num_tokens": 14243252.0,
3080
+ "step": 3060
3081
+ },
3082
+ {
3083
+ "entropy": 0.927897697687149,
3084
+ "epoch": 0.6549333333333334,
3085
+ "grad_norm": 0.32361456751823425,
3086
+ "learning_rate": 7.873865037999373e-05,
3087
+ "loss": 1.0542486190795899,
3088
+ "mean_token_accuracy": 0.7636147439479828,
3089
+ "num_tokens": 14290318.0,
3090
+ "step": 3070
3091
+ },
3092
+ {
3093
+ "entropy": 0.8857385322451592,
3094
+ "epoch": 0.6570666666666667,
3095
+ "grad_norm": 0.25951746106147766,
3096
+ "learning_rate": 7.859685191208297e-05,
3097
+ "loss": 0.9199460983276367,
3098
+ "mean_token_accuracy": 0.7751095175743103,
3099
+ "num_tokens": 14341937.0,
3100
+ "step": 3080
3101
+ },
3102
+ {
3103
+ "entropy": 0.9319920368492604,
3104
+ "epoch": 0.6592,
3105
+ "grad_norm": 0.22098122537136078,
3106
+ "learning_rate": 7.845471081166535e-05,
3107
+ "loss": 1.057561206817627,
3108
+ "mean_token_accuracy": 0.763427771627903,
3109
+ "num_tokens": 14388811.0,
3110
+ "step": 3090
3111
+ },
3112
+ {
3113
+ "entropy": 0.9401551052927971,
3114
+ "epoch": 0.6613333333333333,
3115
+ "grad_norm": 0.25181668996810913,
3116
+ "learning_rate": 7.831222878180115e-05,
3117
+ "loss": 1.0170879364013672,
3118
+ "mean_token_accuracy": 0.7671449035406113,
3119
+ "num_tokens": 14432608.0,
3120
+ "step": 3100
3121
+ },
3122
+ {
3123
+ "entropy": 0.9817736372351646,
3124
+ "epoch": 0.6634666666666666,
3125
+ "grad_norm": 0.25245943665504456,
3126
+ "learning_rate": 7.816940752963543e-05,
3127
+ "loss": 1.1231375694274903,
3128
+ "mean_token_accuracy": 0.7525465905666351,
3129
+ "num_tokens": 14483062.0,
3130
+ "step": 3110
3131
+ },
3132
+ {
3133
+ "entropy": 1.032941934466362,
3134
+ "epoch": 0.6656,
3135
+ "grad_norm": 0.255884051322937,
3136
+ "learning_rate": 7.80262487663777e-05,
3137
+ "loss": 1.1379814147949219,
3138
+ "mean_token_accuracy": 0.7467011958360672,
3139
+ "num_tokens": 14526227.0,
3140
+ "step": 3120
3141
  }
3142
  ],
3143
  "logging_steps": 10,
 
3157
  "attributes": {}
3158
  }
3159
  },
3160
+ "total_flos": 6.879296464710451e+16,
3161
  "train_batch_size": 4,
3162
  "trial_name": null,
3163
  "trial_params": null