FormlessAI commited on
Commit
8367cfa
·
verified ·
1 Parent(s): 6665e8c

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2e4e591593ddd75dbf24587fcbcc927b7e1d83a18cc4d79fb624322ab4dcf29
3
  size 1037269336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b3d9d6f8c7c3d98a19f31f4f971e689cc6fdfb0852e1e518bf400fc7d18e3e
3
  size 1037269336
last-checkpoint/global_step2050/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f1f50cf1df89883314c83ce115d4bd0697b7a001c3f0a39c8e9d04a112b49f3
3
+ size 781993445
last-checkpoint/global_step2050/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8e6cf7468130c623904f39ab9c7b6335e00d8487910ecbd3230398055578d41
3
+ size 781993509
last-checkpoint/global_step2050/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cb7f938d861138b5eda0666f727aaa6c119491f9af743c96888b33a0d9720c3
3
+ size 781993509
last-checkpoint/global_step2050/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7abc0b33c1f2b6911a516a1d19e9a578ec50b64a476e6c17539e75f003861528
3
+ size 781993509
last-checkpoint/global_step2050/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab06e3cbab5dd716dc39bd47d864f0e8bd999fd1f98642aa32b82f488405e614
3
+ size 2610290277
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step1900
 
1
+ global_step2050
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4559053f699be6df3895339df916e2ce9966a4034977cffc0d99dade1ee9496e
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c193fe0eb5414b6e7724a1b6744c6fa4f71192c50142788a1655017eb0888732
3
  size 15429
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7aab77de253b0dca254534a6073da9f6bb65e00a58ae9efbd90fafd066ad0156
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7433361eaf9398847ea003a5f4af9a337f1cc9a3b83827c19956da148a1d9e34
3
  size 15429
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f4896d513f64ca305b2f209b96552a5a9970735a6441fbabe62fba495cba0cd
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e80352d2361ac117f2bf39b8122fb7a7bcfa982eaa10345a5e5e36808edcf2c
3
  size 15429
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5bb3998e16522e731f7c4b020736a29330c3ce56c42847585411d4912a03c7e
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af1d5a16d78e90cc8be44fb0342444095eba9473e2cb4a34b58006386e796243
3
  size 15429
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec5e0cbf05159acfd6692fdbfc765d0c7fdc9d5c3922f19ed25948084c2b7a92
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c2c94b278df9a49d0ea9e3b3354a733420163871be4c32635bfe524c284f7ac
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "best_global_step": null,
3
- "best_metric": 2.118328332901001,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2762029364733246,
6
  "eval_steps": 50,
7
- "global_step": 1900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2972,6 +2972,240 @@
2972
  "eval_samples_per_second": 175.116,
2973
  "eval_steps_per_second": 10.981,
2974
  "step": 1900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2975
  }
2976
  ],
2977
  "logging_steps": 5,
@@ -3000,7 +3234,7 @@
3000
  "attributes": {}
3001
  }
3002
  },
3003
- "total_flos": 4.940882264289444e+17,
3004
  "train_batch_size": 4,
3005
  "trial_name": null,
3006
  "trial_params": null
 
1
  {
2
  "best_global_step": null,
3
+ "best_metric": 2.1106083393096924,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2980084314580608,
6
  "eval_steps": 50,
7
+ "global_step": 2050,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2972
  "eval_samples_per_second": 175.116,
2973
  "eval_steps_per_second": 10.981,
2974
  "step": 1900
2975
+ },
2976
+ {
2977
+ "epoch": 0.2769297863061492,
2978
+ "grad_norm": 2.4674739837646484,
2979
+ "learning_rate": 9.220938228335823e-05,
2980
+ "loss": 2.3273,
2981
+ "step": 1905
2982
+ },
2983
+ {
2984
+ "epoch": 0.2776566361389737,
2985
+ "grad_norm": 2.532355546951294,
2986
+ "learning_rate": 9.216626932892173e-05,
2987
+ "loss": 2.1482,
2988
+ "step": 1910
2989
+ },
2990
+ {
2991
+ "epoch": 0.27838348597179824,
2992
+ "grad_norm": 2.4314658641815186,
2993
+ "learning_rate": 9.212305046859271e-05,
2994
+ "loss": 2.2897,
2995
+ "step": 1915
2996
+ },
2997
+ {
2998
+ "epoch": 0.27911033580462274,
2999
+ "grad_norm": 2.5094339847564697,
3000
+ "learning_rate": 9.20797258111746e-05,
3001
+ "loss": 2.2804,
3002
+ "step": 1920
3003
+ },
3004
+ {
3005
+ "epoch": 0.2798371856374473,
3006
+ "grad_norm": 2.559231758117676,
3007
+ "learning_rate": 9.203629546573719e-05,
3008
+ "loss": 2.0258,
3009
+ "step": 1925
3010
+ },
3011
+ {
3012
+ "epoch": 0.28056403547027187,
3013
+ "grad_norm": 2.7684295177459717,
3014
+ "learning_rate": 9.199275954161631e-05,
3015
+ "loss": 2.0826,
3016
+ "step": 1930
3017
+ },
3018
+ {
3019
+ "epoch": 0.28129088530309637,
3020
+ "grad_norm": 2.8592333793640137,
3021
+ "learning_rate": 9.19491181484136e-05,
3022
+ "loss": 2.0619,
3023
+ "step": 1935
3024
+ },
3025
+ {
3026
+ "epoch": 0.28201773513592093,
3027
+ "grad_norm": 2.4586992263793945,
3028
+ "learning_rate": 9.190537139599621e-05,
3029
+ "loss": 2.0845,
3030
+ "step": 1940
3031
+ },
3032
+ {
3033
+ "epoch": 0.28274458496874544,
3034
+ "grad_norm": 2.6051766872406006,
3035
+ "learning_rate": 9.186151939449656e-05,
3036
+ "loss": 2.2766,
3037
+ "step": 1945
3038
+ },
3039
+ {
3040
+ "epoch": 0.28347143480157,
3041
+ "grad_norm": 2.544299602508545,
3042
+ "learning_rate": 9.181756225431198e-05,
3043
+ "loss": 2.2845,
3044
+ "step": 1950
3045
+ },
3046
+ {
3047
+ "epoch": 0.28347143480157,
3048
+ "eval_loss": 2.1199252605438232,
3049
+ "eval_runtime": 21.7179,
3050
+ "eval_samples_per_second": 151.994,
3051
+ "eval_steps_per_second": 9.531,
3052
+ "step": 1950
3053
+ },
3054
+ {
3055
+ "epoch": 0.28419828463439456,
3056
+ "grad_norm": 2.257642984390259,
3057
+ "learning_rate": 9.177350008610454e-05,
3058
+ "loss": 2.1782,
3059
+ "step": 1955
3060
+ },
3061
+ {
3062
+ "epoch": 0.28492513446721907,
3063
+ "grad_norm": 2.503793478012085,
3064
+ "learning_rate": 9.17293330008007e-05,
3065
+ "loss": 1.9996,
3066
+ "step": 1960
3067
+ },
3068
+ {
3069
+ "epoch": 0.2856519843000436,
3070
+ "grad_norm": 2.571162700653076,
3071
+ "learning_rate": 9.168506110959102e-05,
3072
+ "loss": 2.1955,
3073
+ "step": 1965
3074
+ },
3075
+ {
3076
+ "epoch": 0.28637883413286813,
3077
+ "grad_norm": 2.426426649093628,
3078
+ "learning_rate": 9.164068452392995e-05,
3079
+ "loss": 2.349,
3080
+ "step": 1970
3081
+ },
3082
+ {
3083
+ "epoch": 0.2871056839656927,
3084
+ "grad_norm": 2.418959140777588,
3085
+ "learning_rate": 9.159620335553549e-05,
3086
+ "loss": 2.2726,
3087
+ "step": 1975
3088
+ },
3089
+ {
3090
+ "epoch": 0.28783253379851725,
3091
+ "grad_norm": 2.270207166671753,
3092
+ "learning_rate": 9.155161771638894e-05,
3093
+ "loss": 2.1158,
3094
+ "step": 1980
3095
+ },
3096
+ {
3097
+ "epoch": 0.28855938363134176,
3098
+ "grad_norm": 2.4553415775299072,
3099
+ "learning_rate": 9.150692771873457e-05,
3100
+ "loss": 2.2628,
3101
+ "step": 1985
3102
+ },
3103
+ {
3104
+ "epoch": 0.2892862334641663,
3105
+ "grad_norm": 2.6823744773864746,
3106
+ "learning_rate": 9.14621334750794e-05,
3107
+ "loss": 2.2798,
3108
+ "step": 1990
3109
+ },
3110
+ {
3111
+ "epoch": 0.2900130832969908,
3112
+ "grad_norm": 2.46555757522583,
3113
+ "learning_rate": 9.141723509819289e-05,
3114
+ "loss": 2.1324,
3115
+ "step": 1995
3116
+ },
3117
+ {
3118
+ "epoch": 0.2907399331298154,
3119
+ "grad_norm": 2.363006353378296,
3120
+ "learning_rate": 9.137223270110667e-05,
3121
+ "loss": 2.0691,
3122
+ "step": 2000
3123
+ },
3124
+ {
3125
+ "epoch": 0.2907399331298154,
3126
+ "eval_loss": 2.1202690601348877,
3127
+ "eval_runtime": 18.9518,
3128
+ "eval_samples_per_second": 174.179,
3129
+ "eval_steps_per_second": 10.922,
3130
+ "step": 2000
3131
+ },
3132
+ {
3133
+ "epoch": 0.2914667829626399,
3134
+ "grad_norm": 2.6231467723846436,
3135
+ "learning_rate": 9.132712639711419e-05,
3136
+ "loss": 2.188,
3137
+ "step": 2005
3138
+ },
3139
+ {
3140
+ "epoch": 0.29219363279546445,
3141
+ "grad_norm": 2.40796160697937,
3142
+ "learning_rate": 9.128191629977054e-05,
3143
+ "loss": 2.3451,
3144
+ "step": 2010
3145
+ },
3146
+ {
3147
+ "epoch": 0.292920482628289,
3148
+ "grad_norm": 2.8170089721679688,
3149
+ "learning_rate": 9.123660252289206e-05,
3150
+ "loss": 2.2494,
3151
+ "step": 2015
3152
+ },
3153
+ {
3154
+ "epoch": 0.2936473324611135,
3155
+ "grad_norm": 2.571594715118408,
3156
+ "learning_rate": 9.119118518055617e-05,
3157
+ "loss": 2.1625,
3158
+ "step": 2020
3159
+ },
3160
+ {
3161
+ "epoch": 0.2943741822939381,
3162
+ "grad_norm": 2.5995700359344482,
3163
+ "learning_rate": 9.114566438710093e-05,
3164
+ "loss": 2.1386,
3165
+ "step": 2025
3166
+ },
3167
+ {
3168
+ "epoch": 0.2951010321267626,
3169
+ "grad_norm": 2.276432991027832,
3170
+ "learning_rate": 9.11000402571249e-05,
3171
+ "loss": 2.1441,
3172
+ "step": 2030
3173
+ },
3174
+ {
3175
+ "epoch": 0.29582788195958715,
3176
+ "grad_norm": 2.379159450531006,
3177
+ "learning_rate": 9.105431290548679e-05,
3178
+ "loss": 2.1499,
3179
+ "step": 2035
3180
+ },
3181
+ {
3182
+ "epoch": 0.2965547317924117,
3183
+ "grad_norm": 2.4301207065582275,
3184
+ "learning_rate": 9.100848244730514e-05,
3185
+ "loss": 2.2989,
3186
+ "step": 2040
3187
+ },
3188
+ {
3189
+ "epoch": 0.2972815816252362,
3190
+ "grad_norm": 2.3967955112457275,
3191
+ "learning_rate": 9.096254899795806e-05,
3192
+ "loss": 2.1492,
3193
+ "step": 2045
3194
+ },
3195
+ {
3196
+ "epoch": 0.2980084314580608,
3197
+ "grad_norm": 2.603433847427368,
3198
+ "learning_rate": 9.091651267308299e-05,
3199
+ "loss": 2.195,
3200
+ "step": 2050
3201
+ },
3202
+ {
3203
+ "epoch": 0.2980084314580608,
3204
+ "eval_loss": 2.1106083393096924,
3205
+ "eval_runtime": 18.7966,
3206
+ "eval_samples_per_second": 175.617,
3207
+ "eval_steps_per_second": 11.013,
3208
+ "step": 2050
3209
  }
3210
  ],
3211
  "logging_steps": 5,
 
3234
  "attributes": {}
3235
  }
3236
  },
3237
+ "total_flos": 5.336645061784371e+17,
3238
  "train_batch_size": 4,
3239
  "trial_name": null,
3240
  "trial_params": null