8BitStudio commited on
Commit
2d78e73
·
verified ·
1 Parent(s): 9f95604

Training in progress, step 24000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:299b24fe69c89f19141b9f985a9ac826c3a53ad4e1b08b8aba5729be39c93c43
3
  size 1520630616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a5e5a99de9ebd5a605eb747b364712a2371d3faff976fb42a4e2c4eff124586
3
  size 1520630616
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2578fa210b28417d8f969fa905bceff91b35a10909b4f603355ac6d743992a10
3
  size 3041448587
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b90f9e040c4d2727932a8fb5449e7e222d2b34c261ceee7f00e20f78d73acb5
3
  size 3041448587
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59dbdf3564f71a619277fad1d7b29f944b0a8aee767f1ee531e2a42c249a6709
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf31faff1a59206513a6140313f60a81b0b7bbfaceaf131da05eee348e2a75b6
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5b97fc3e9888373aed6e862ae95add028b1c9773804bea656915decaab6270d
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5b896146d5d8a1fece26c83d1cdd06bac435f33fada598258a6302b90095e53
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 6.014666666666667,
6
  "eval_steps": 500,
7
- "global_step": 22000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3088,6 +3088,286 @@
3088
  "learning_rate": 0.00026558709954008095,
3089
  "loss": 1.566,
3090
  "step": 22000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3091
  }
3092
  ],
3093
  "logging_steps": 50,
@@ -3107,7 +3387,7 @@
3107
  "attributes": {}
3108
  }
3109
  },
3110
- "total_flos": 1.1765225285807505e+19,
3111
  "train_batch_size": 16,
3112
  "trial_name": null,
3113
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 6.036524590163935,
6
  "eval_steps": 500,
7
+ "global_step": 24000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3088
  "learning_rate": 0.00026558709954008095,
3089
  "loss": 1.566,
3090
  "step": 22000
3091
+ },
3092
+ {
3093
+ "epoch": 6.015213114754098,
3094
+ "grad_norm": 0.62890625,
3095
+ "learning_rate": 0.00026541837871893367,
3096
+ "loss": 1.5595,
3097
+ "step": 22050
3098
+ },
3099
+ {
3100
+ "epoch": 6.01575956284153,
3101
+ "grad_norm": 0.5625,
3102
+ "learning_rate": 0.0002652492991748029,
3103
+ "loss": 1.5206,
3104
+ "step": 22100
3105
+ },
3106
+ {
3107
+ "epoch": 6.016306010928962,
3108
+ "grad_norm": 0.640625,
3109
+ "learning_rate": 0.00026507986143319164,
3110
+ "loss": 1.5374,
3111
+ "step": 22150
3112
+ },
3113
+ {
3114
+ "epoch": 6.016852459016394,
3115
+ "grad_norm": 0.703125,
3116
+ "learning_rate": 0.0002649100660207164,
3117
+ "loss": 1.522,
3118
+ "step": 22200
3119
+ },
3120
+ {
3121
+ "epoch": 6.017398907103825,
3122
+ "grad_norm": 0.65625,
3123
+ "learning_rate": 0.0002647399134651053,
3124
+ "loss": 1.5532,
3125
+ "step": 22250
3126
+ },
3127
+ {
3128
+ "epoch": 6.017945355191257,
3129
+ "grad_norm": 0.63671875,
3130
+ "learning_rate": 0.0002645694042951963,
3131
+ "loss": 1.5274,
3132
+ "step": 22300
3133
+ },
3134
+ {
3135
+ "epoch": 6.018491803278689,
3136
+ "grad_norm": 0.65625,
3137
+ "learning_rate": 0.00026439853904093586,
3138
+ "loss": 1.517,
3139
+ "step": 22350
3140
+ },
3141
+ {
3142
+ "epoch": 6.01903825136612,
3143
+ "grad_norm": 0.6875,
3144
+ "learning_rate": 0.00026422731823337717,
3145
+ "loss": 1.5197,
3146
+ "step": 22400
3147
+ },
3148
+ {
3149
+ "epoch": 6.019584699453552,
3150
+ "grad_norm": 0.59765625,
3151
+ "learning_rate": 0.0002640557424046784,
3152
+ "loss": 1.5081,
3153
+ "step": 22450
3154
+ },
3155
+ {
3156
+ "epoch": 6.020131147540984,
3157
+ "grad_norm": 0.65234375,
3158
+ "learning_rate": 0.0002638838120881012,
3159
+ "loss": 1.5526,
3160
+ "step": 22500
3161
+ },
3162
+ {
3163
+ "epoch": 6.020677595628415,
3164
+ "grad_norm": 0.62109375,
3165
+ "learning_rate": 0.000263711527818009,
3166
+ "loss": 1.534,
3167
+ "step": 22550
3168
+ },
3169
+ {
3170
+ "epoch": 6.021224043715847,
3171
+ "grad_norm": 0.60546875,
3172
+ "learning_rate": 0.0002635388901298652,
3173
+ "loss": 1.5156,
3174
+ "step": 22600
3175
+ },
3176
+ {
3177
+ "epoch": 6.021770491803279,
3178
+ "grad_norm": 0.71875,
3179
+ "learning_rate": 0.0002633658995602318,
3180
+ "loss": 1.5402,
3181
+ "step": 22650
3182
+ },
3183
+ {
3184
+ "epoch": 6.02231693989071,
3185
+ "grad_norm": 0.68359375,
3186
+ "learning_rate": 0.0002631925566467674,
3187
+ "loss": 1.5367,
3188
+ "step": 22700
3189
+ },
3190
+ {
3191
+ "epoch": 6.022863387978142,
3192
+ "grad_norm": 0.62890625,
3193
+ "learning_rate": 0.00026301886192822585,
3194
+ "loss": 1.5126,
3195
+ "step": 22750
3196
+ },
3197
+ {
3198
+ "epoch": 6.023409836065574,
3199
+ "grad_norm": 0.625,
3200
+ "learning_rate": 0.00026284481594445434,
3201
+ "loss": 1.5097,
3202
+ "step": 22800
3203
+ },
3204
+ {
3205
+ "epoch": 6.023956284153005,
3206
+ "grad_norm": 0.60546875,
3207
+ "learning_rate": 0.00026267041923639175,
3208
+ "loss": 1.5058,
3209
+ "step": 22850
3210
+ },
3211
+ {
3212
+ "epoch": 6.024502732240437,
3213
+ "grad_norm": 0.703125,
3214
+ "learning_rate": 0.00026249567234606707,
3215
+ "loss": 1.5004,
3216
+ "step": 22900
3217
+ },
3218
+ {
3219
+ "epoch": 6.025049180327869,
3220
+ "grad_norm": 0.60546875,
3221
+ "learning_rate": 0.00026232057581659777,
3222
+ "loss": 1.4884,
3223
+ "step": 22950
3224
+ },
3225
+ {
3226
+ "epoch": 6.0255956284153,
3227
+ "grad_norm": 0.65234375,
3228
+ "learning_rate": 0.0002621451301921878,
3229
+ "loss": 1.5884,
3230
+ "step": 23000
3231
+ },
3232
+ {
3233
+ "epoch": 6.026142076502732,
3234
+ "grad_norm": 0.6484375,
3235
+ "learning_rate": 0.00026196933601812616,
3236
+ "loss": 1.565,
3237
+ "step": 23050
3238
+ },
3239
+ {
3240
+ "epoch": 6.026688524590164,
3241
+ "grad_norm": 0.69140625,
3242
+ "learning_rate": 0.00026179319384078535,
3243
+ "loss": 1.5399,
3244
+ "step": 23100
3245
+ },
3246
+ {
3247
+ "epoch": 6.027234972677595,
3248
+ "grad_norm": 0.57421875,
3249
+ "learning_rate": 0.0002616167042076192,
3250
+ "loss": 1.5319,
3251
+ "step": 23150
3252
+ },
3253
+ {
3254
+ "epoch": 6.027781420765027,
3255
+ "grad_norm": 0.62890625,
3256
+ "learning_rate": 0.0002614398676671616,
3257
+ "loss": 1.5379,
3258
+ "step": 23200
3259
+ },
3260
+ {
3261
+ "epoch": 6.028327868852459,
3262
+ "grad_norm": 0.69921875,
3263
+ "learning_rate": 0.0002612626847690247,
3264
+ "loss": 1.5344,
3265
+ "step": 23250
3266
+ },
3267
+ {
3268
+ "epoch": 6.02887431693989,
3269
+ "grad_norm": 0.59765625,
3270
+ "learning_rate": 0.0002610851560638968,
3271
+ "loss": 1.5054,
3272
+ "step": 23300
3273
+ },
3274
+ {
3275
+ "epoch": 6.029420765027322,
3276
+ "grad_norm": 0.6484375,
3277
+ "learning_rate": 0.0002609072821035415,
3278
+ "loss": 1.5421,
3279
+ "step": 23350
3280
+ },
3281
+ {
3282
+ "epoch": 6.029967213114754,
3283
+ "grad_norm": 0.69921875,
3284
+ "learning_rate": 0.00026072906344079484,
3285
+ "loss": 1.5625,
3286
+ "step": 23400
3287
+ },
3288
+ {
3289
+ "epoch": 6.0305136612021855,
3290
+ "grad_norm": 0.59765625,
3291
+ "learning_rate": 0.0002605505006295648,
3292
+ "loss": 1.5476,
3293
+ "step": 23450
3294
+ },
3295
+ {
3296
+ "epoch": 6.031060109289617,
3297
+ "grad_norm": 0.609375,
3298
+ "learning_rate": 0.00026037159422482865,
3299
+ "loss": 1.537,
3300
+ "step": 23500
3301
+ },
3302
+ {
3303
+ "epoch": 6.031606557377049,
3304
+ "grad_norm": 0.60546875,
3305
+ "learning_rate": 0.00026019234478263155,
3306
+ "loss": 1.5204,
3307
+ "step": 23550
3308
+ },
3309
+ {
3310
+ "epoch": 6.0321530054644805,
3311
+ "grad_norm": 0.69921875,
3312
+ "learning_rate": 0.000260012752860085,
3313
+ "loss": 1.5479,
3314
+ "step": 23600
3315
+ },
3316
+ {
3317
+ "epoch": 6.0326994535519125,
3318
+ "grad_norm": 0.6171875,
3319
+ "learning_rate": 0.00025983281901536474,
3320
+ "loss": 1.5304,
3321
+ "step": 23650
3322
+ },
3323
+ {
3324
+ "epoch": 6.0332459016393445,
3325
+ "grad_norm": 0.6171875,
3326
+ "learning_rate": 0.00025965254380770945,
3327
+ "loss": 1.5738,
3328
+ "step": 23700
3329
+ },
3330
+ {
3331
+ "epoch": 6.033792349726776,
3332
+ "grad_norm": 0.62109375,
3333
+ "learning_rate": 0.0002594719277974185,
3334
+ "loss": 1.5168,
3335
+ "step": 23750
3336
+ },
3337
+ {
3338
+ "epoch": 6.034338797814208,
3339
+ "grad_norm": 0.6328125,
3340
+ "learning_rate": 0.0002592909715458506,
3341
+ "loss": 1.4984,
3342
+ "step": 23800
3343
+ },
3344
+ {
3345
+ "epoch": 6.0348852459016395,
3346
+ "grad_norm": 0.57421875,
3347
+ "learning_rate": 0.0002591096756154221,
3348
+ "loss": 1.5721,
3349
+ "step": 23850
3350
+ },
3351
+ {
3352
+ "epoch": 6.035431693989071,
3353
+ "grad_norm": 0.6015625,
3354
+ "learning_rate": 0.0002589280405696048,
3355
+ "loss": 1.5369,
3356
+ "step": 23900
3357
+ },
3358
+ {
3359
+ "epoch": 6.035978142076503,
3360
+ "grad_norm": 0.61328125,
3361
+ "learning_rate": 0.00025874606697292473,
3362
+ "loss": 1.5236,
3363
+ "step": 23950
3364
+ },
3365
+ {
3366
+ "epoch": 6.036524590163935,
3367
+ "grad_norm": 0.6953125,
3368
+ "learning_rate": 0.00025856375539095986,
3369
+ "loss": 1.577,
3370
+ "step": 24000
3371
  }
3372
  ],
3373
  "logging_steps": 50,
 
3387
  "attributes": {}
3388
  }
3389
  },
3390
+ "total_flos": 1.2834709186604433e+19,
3391
  "train_batch_size": 16,
3392
  "trial_name": null,
3393
  "trial_params": null