NBAmine commited on
Commit
def5536
·
verified ·
1 Parent(s): a98b017

Training in progress, step 3000, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -31,11 +31,11 @@
31
  "target_modules": [
32
  "v_proj",
33
  "k_proj",
 
 
34
  "gate_proj",
35
- "o_proj",
36
  "up_proj",
37
- "down_proj",
38
- "q_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
31
  "target_modules": [
32
  "v_proj",
33
  "k_proj",
34
+ "down_proj",
35
+ "q_proj",
36
  "gate_proj",
 
37
  "up_proj",
38
+ "o_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df0b3c057589426de11702e8aa51f40578fbdc1c16b5298b4df1b3741a358543
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ddd49b9fa83b41042972589b0185429c9038b2514af8abc9c0ad4f6f229c6c8
3
  size 228140600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e1a2a35f3f40624f11f416233f78a070b1dea29da95a3a90a9a787a9173de3d
3
  size 117931203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22dc5729293f37d17c0b6650d94819a21d18fab4c702a46d62401aec711792f3
3
  size 117931203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:54ee403e6e7f52e165fb91ab2843ca4f38ca3d3c64d81b59c5a39f9e4c098413
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce6193889ea75b9cef214b87184b6c99e6c6f661ab938ae5ad158be7367ecf8b
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88be0f049d620e88b111c309644f5ca8c552ca0e64dbf5a41f67ac4dd14016eb
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ab8f7fae8c5bc945ba8d0476887328f81726abcc0550ee4572fa2d3eac0adcb
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6abcf0c15a7ba90c608cb1903d96b4ad18eb9806fb694a46be4e23a52b64410b
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a3a79343e37b2abae291bedd1957475ce7f9b47f8942adec4a76182dbe5dbf9
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 750,
3
  "best_metric": 0.5089643597602844,
4
  "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
5
- "epoch": 4.32,
6
  "eval_steps": 300,
7
- "global_step": 2700,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2876,6 +2876,318 @@
2876
  "eval_samples_per_second": 2.299,
2877
  "eval_steps_per_second": 0.575,
2878
  "step": 2700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2879
  }
2880
  ],
2881
  "logging_steps": 10,
@@ -2895,7 +3207,7 @@
2895
  "attributes": {}
2896
  }
2897
  },
2898
- "total_flos": 4.639214588564275e+17,
2899
  "train_batch_size": 1,
2900
  "trial_name": null,
2901
  "trial_params": null
 
2
  "best_global_step": 750,
3
  "best_metric": 0.5089643597602844,
4
  "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
5
+ "epoch": 4.8,
6
  "eval_steps": 300,
7
+ "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2876
  "eval_samples_per_second": 2.299,
2877
  "eval_steps_per_second": 0.575,
2878
  "step": 2700
2879
+ },
2880
+ {
2881
+ "entropy": 0.24316317560151218,
2882
+ "epoch": 4.336,
2883
+ "grad_norm": 0.757876455783844,
2884
+ "learning_rate": 1.3376e-05,
2885
+ "loss": 0.2118,
2886
+ "mean_token_accuracy": 0.9327260747551918,
2887
+ "num_tokens": 39749.0,
2888
+ "step": 2710
2889
+ },
2890
+ {
2891
+ "entropy": 0.2465177897363901,
2892
+ "epoch": 4.352,
2893
+ "grad_norm": 0.73354172706604,
2894
+ "learning_rate": 1.3056000000000002e-05,
2895
+ "loss": 0.21,
2896
+ "mean_token_accuracy": 0.9354286625981331,
2897
+ "num_tokens": 68464.0,
2898
+ "step": 2720
2899
+ },
2900
+ {
2901
+ "entropy": 0.24799817334860563,
2902
+ "epoch": 4.368,
2903
+ "grad_norm": 0.9990701675415039,
2904
+ "learning_rate": 1.2736000000000001e-05,
2905
+ "loss": 0.2039,
2906
+ "mean_token_accuracy": 0.940489636361599,
2907
+ "num_tokens": 91656.0,
2908
+ "step": 2730
2909
+ },
2910
+ {
2911
+ "entropy": 0.26067384518682957,
2912
+ "epoch": 4.384,
2913
+ "grad_norm": 0.9379425644874573,
2914
+ "learning_rate": 1.2416000000000001e-05,
2915
+ "loss": 0.2182,
2916
+ "mean_token_accuracy": 0.9411718167364598,
2917
+ "num_tokens": 110505.0,
2918
+ "step": 2740
2919
+ },
2920
+ {
2921
+ "entropy": 0.3018894817214459,
2922
+ "epoch": 4.4,
2923
+ "grad_norm": 1.0026336908340454,
2924
+ "learning_rate": 1.2096e-05,
2925
+ "loss": 0.2267,
2926
+ "mean_token_accuracy": 0.9386275008320808,
2927
+ "num_tokens": 123324.0,
2928
+ "step": 2750
2929
+ },
2930
+ {
2931
+ "entropy": 0.21805389355868102,
2932
+ "epoch": 4.416,
2933
+ "grad_norm": 0.6372848153114319,
2934
+ "learning_rate": 1.1776e-05,
2935
+ "loss": 0.1861,
2936
+ "mean_token_accuracy": 0.9427805945277214,
2937
+ "num_tokens": 163777.0,
2938
+ "step": 2760
2939
+ },
2940
+ {
2941
+ "entropy": 0.21196621540002525,
2942
+ "epoch": 4.432,
2943
+ "grad_norm": 0.5572025179862976,
2944
+ "learning_rate": 1.1456e-05,
2945
+ "loss": 0.1581,
2946
+ "mean_token_accuracy": 0.9551307797431946,
2947
+ "num_tokens": 192177.0,
2948
+ "step": 2770
2949
+ },
2950
+ {
2951
+ "entropy": 0.20902398317120968,
2952
+ "epoch": 4.448,
2953
+ "grad_norm": 0.7340620756149292,
2954
+ "learning_rate": 1.1136e-05,
2955
+ "loss": 0.1582,
2956
+ "mean_token_accuracy": 0.9570909071713686,
2957
+ "num_tokens": 215456.0,
2958
+ "step": 2780
2959
+ },
2960
+ {
2961
+ "entropy": 0.2131565590389073,
2962
+ "epoch": 4.464,
2963
+ "grad_norm": 1.0014139413833618,
2964
+ "learning_rate": 1.0816000000000001e-05,
2965
+ "loss": 0.1583,
2966
+ "mean_token_accuracy": 0.9551056247204542,
2967
+ "num_tokens": 234122.0,
2968
+ "step": 2790
2969
+ },
2970
+ {
2971
+ "entropy": 0.25133530045859515,
2972
+ "epoch": 4.48,
2973
+ "grad_norm": 0.8922705054283142,
2974
+ "learning_rate": 1.0496e-05,
2975
+ "loss": 0.1818,
2976
+ "mean_token_accuracy": 0.9524805508553982,
2977
+ "num_tokens": 246749.0,
2978
+ "step": 2800
2979
+ },
2980
+ {
2981
+ "entropy": 0.19833970288746058,
2982
+ "epoch": 4.496,
2983
+ "grad_norm": 0.8713212609291077,
2984
+ "learning_rate": 1.0176e-05,
2985
+ "loss": 0.1667,
2986
+ "mean_token_accuracy": 0.9479088947176934,
2987
+ "num_tokens": 287475.0,
2988
+ "step": 2810
2989
+ },
2990
+ {
2991
+ "entropy": 0.18820378091186285,
2992
+ "epoch": 4.5120000000000005,
2993
+ "grad_norm": 0.782958984375,
2994
+ "learning_rate": 9.856e-06,
2995
+ "loss": 0.1507,
2996
+ "mean_token_accuracy": 0.9564289052039385,
2997
+ "num_tokens": 316228.0,
2998
+ "step": 2820
2999
+ },
3000
+ {
3001
+ "entropy": 0.1986434136983007,
3002
+ "epoch": 4.5280000000000005,
3003
+ "grad_norm": 0.9405664801597595,
3004
+ "learning_rate": 9.536e-06,
3005
+ "loss": 0.1652,
3006
+ "mean_token_accuracy": 0.9527083396911621,
3007
+ "num_tokens": 339312.0,
3008
+ "step": 2830
3009
+ },
3010
+ {
3011
+ "entropy": 0.20359546076506377,
3012
+ "epoch": 4.5440000000000005,
3013
+ "grad_norm": 1.8294662237167358,
3014
+ "learning_rate": 9.216000000000001e-06,
3015
+ "loss": 0.1605,
3016
+ "mean_token_accuracy": 0.958249793574214,
3017
+ "num_tokens": 357957.0,
3018
+ "step": 2840
3019
+ },
3020
+ {
3021
+ "entropy": 0.2478945675306022,
3022
+ "epoch": 4.5600000000000005,
3023
+ "grad_norm": 1.8756585121154785,
3024
+ "learning_rate": 8.896000000000001e-06,
3025
+ "loss": 0.1791,
3026
+ "mean_token_accuracy": 0.9529225923120975,
3027
+ "num_tokens": 371074.0,
3028
+ "step": 2850
3029
+ },
3030
+ {
3031
+ "entropy": 0.19137877360917627,
3032
+ "epoch": 4.576,
3033
+ "grad_norm": 0.7811349034309387,
3034
+ "learning_rate": 8.576e-06,
3035
+ "loss": 0.1603,
3036
+ "mean_token_accuracy": 0.9505746208131314,
3037
+ "num_tokens": 412461.0,
3038
+ "step": 2860
3039
+ },
3040
+ {
3041
+ "entropy": 0.19941019406542182,
3042
+ "epoch": 4.592,
3043
+ "grad_norm": 0.8849194645881653,
3044
+ "learning_rate": 8.256e-06,
3045
+ "loss": 0.1559,
3046
+ "mean_token_accuracy": 0.9538026105612516,
3047
+ "num_tokens": 441113.0,
3048
+ "step": 2870
3049
+ },
3050
+ {
3051
+ "entropy": 0.20037598102353513,
3052
+ "epoch": 4.608,
3053
+ "grad_norm": 1.007367730140686,
3054
+ "learning_rate": 7.936e-06,
3055
+ "loss": 0.1577,
3056
+ "mean_token_accuracy": 0.9563030891120434,
3057
+ "num_tokens": 464301.0,
3058
+ "step": 2880
3059
+ },
3060
+ {
3061
+ "entropy": 0.21458538975566627,
3062
+ "epoch": 4.624,
3063
+ "grad_norm": 1.0605765581130981,
3064
+ "learning_rate": 7.616000000000001e-06,
3065
+ "loss": 0.1636,
3066
+ "mean_token_accuracy": 0.9558106277137994,
3067
+ "num_tokens": 483422.0,
3068
+ "step": 2890
3069
+ },
3070
+ {
3071
+ "entropy": 0.2460995698813349,
3072
+ "epoch": 4.64,
3073
+ "grad_norm": 1.1102747917175293,
3074
+ "learning_rate": 7.296e-06,
3075
+ "loss": 0.178,
3076
+ "mean_token_accuracy": 0.9527418158948422,
3077
+ "num_tokens": 496524.0,
3078
+ "step": 2900
3079
+ },
3080
+ {
3081
+ "entropy": 0.1917059404309839,
3082
+ "epoch": 4.656,
3083
+ "grad_norm": 0.7104383111000061,
3084
+ "learning_rate": 6.976000000000001e-06,
3085
+ "loss": 0.1692,
3086
+ "mean_token_accuracy": 0.9471572674810886,
3087
+ "num_tokens": 537262.0,
3088
+ "step": 2910
3089
+ },
3090
+ {
3091
+ "entropy": 0.19903061082586646,
3092
+ "epoch": 4.672,
3093
+ "grad_norm": 0.8522951006889343,
3094
+ "learning_rate": 6.688e-06,
3095
+ "loss": 0.1668,
3096
+ "mean_token_accuracy": 0.9495650254189968,
3097
+ "num_tokens": 566118.0,
3098
+ "step": 2920
3099
+ },
3100
+ {
3101
+ "entropy": 0.20533090075477958,
3102
+ "epoch": 4.688,
3103
+ "grad_norm": 0.7692112326622009,
3104
+ "learning_rate": 6.368000000000001e-06,
3105
+ "loss": 0.1597,
3106
+ "mean_token_accuracy": 0.9538190443068743,
3107
+ "num_tokens": 589316.0,
3108
+ "step": 2930
3109
+ },
3110
+ {
3111
+ "entropy": 0.20868746675550937,
3112
+ "epoch": 4.704,
3113
+ "grad_norm": 0.8645059466362,
3114
+ "learning_rate": 6.048e-06,
3115
+ "loss": 0.1496,
3116
+ "mean_token_accuracy": 0.9595503833144903,
3117
+ "num_tokens": 607904.0,
3118
+ "step": 2940
3119
+ },
3120
+ {
3121
+ "entropy": 0.23888139198534192,
3122
+ "epoch": 4.72,
3123
+ "grad_norm": 1.08635413646698,
3124
+ "learning_rate": 5.728e-06,
3125
+ "loss": 0.1706,
3126
+ "mean_token_accuracy": 0.9570875108242035,
3127
+ "num_tokens": 620936.0,
3128
+ "step": 2950
3129
+ },
3130
+ {
3131
+ "entropy": 0.18963255980052054,
3132
+ "epoch": 4.736,
3133
+ "grad_norm": 0.7276900410652161,
3134
+ "learning_rate": 5.4080000000000006e-06,
3135
+ "loss": 0.1633,
3136
+ "mean_token_accuracy": 0.9485368836671114,
3137
+ "num_tokens": 661079.0,
3138
+ "step": 2960
3139
+ },
3140
+ {
3141
+ "entropy": 0.19404892213642597,
3142
+ "epoch": 4.752,
3143
+ "grad_norm": 0.8436645269393921,
3144
+ "learning_rate": 5.088e-06,
3145
+ "loss": 0.1523,
3146
+ "mean_token_accuracy": 0.9547487128525972,
3147
+ "num_tokens": 689649.0,
3148
+ "step": 2970
3149
+ },
3150
+ {
3151
+ "entropy": 0.20046764588914812,
3152
+ "epoch": 4.768,
3153
+ "grad_norm": 1.0704182386398315,
3154
+ "learning_rate": 4.768e-06,
3155
+ "loss": 0.1574,
3156
+ "mean_token_accuracy": 0.9545170154422522,
3157
+ "num_tokens": 712841.0,
3158
+ "step": 2980
3159
+ },
3160
+ {
3161
+ "entropy": 0.2065018493682146,
3162
+ "epoch": 4.784,
3163
+ "grad_norm": 0.9045215249061584,
3164
+ "learning_rate": 4.4480000000000004e-06,
3165
+ "loss": 0.155,
3166
+ "mean_token_accuracy": 0.9589469760656357,
3167
+ "num_tokens": 731548.0,
3168
+ "step": 2990
3169
+ },
3170
+ {
3171
+ "entropy": 0.2458665339741856,
3172
+ "epoch": 4.8,
3173
+ "grad_norm": 1.7165741920471191,
3174
+ "learning_rate": 4.128e-06,
3175
+ "loss": 0.173,
3176
+ "mean_token_accuracy": 0.9542810652405024,
3177
+ "num_tokens": 744375.0,
3178
+ "step": 3000
3179
+ },
3180
+ {
3181
+ "epoch": 4.8,
3182
+ "eval_accuracy": 0.026236095361078154,
3183
+ "eval_entropy": 0.3239293715655804,
3184
+ "eval_loss": 0.6594926714897156,
3185
+ "eval_mean_token_accuracy": 0.8544400478601456,
3186
+ "eval_num_tokens": 744375.0,
3187
+ "eval_runtime": 966.0583,
3188
+ "eval_samples_per_second": 2.07,
3189
+ "eval_steps_per_second": 0.518,
3190
+ "step": 3000
3191
  }
3192
  ],
3193
  "logging_steps": 10,
 
3207
  "attributes": {}
3208
  }
3209
  },
3210
+ "total_flos": 5.158805165012275e+17,
3211
  "train_batch_size": 1,
3212
  "trial_name": null,
3213
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc0c9c43aae96575e8afc416e967ac5674d13cc1a38c487b69cd4534aafef005
3
  size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab977af6525318ffc5b089ead4268f65e71f68e9d355f66185c43f4d771a6da2
3
  size 6353