NBAmine committed on
Commit
dce6a88
·
verified ·
1 Parent(s): 11c94ed

Training in progress, step 3125, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ddd49b9fa83b41042972589b0185429c9038b2514af8abc9c0ad4f6f229c6c8
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31e65c9ff039c74d59b4607524385f75a8ae083b148b3a163cece010a9774af0
3
  size 228140600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:22dc5729293f37d17c0b6650d94819a21d18fab4c702a46d62401aec711792f3
3
  size 117931203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b96216027c02e20a6ee8541060ecd0085b74fd0ea5669cf82258347c31d3baf
3
  size 117931203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce6193889ea75b9cef214b87184b6c99e6c6f661ab938ae5ad158be7367ecf8b
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e788bee1c067926ef11645e418ec428402ec185fb9258e04df56296e42d2286b
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ab8f7fae8c5bc945ba8d0476887328f81726abcc0550ee4572fa2d3eac0adcb
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e230928162c4463d462e64ab14b3906988dfebe47926d517a84f2e81ec7582c
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a3a79343e37b2abae291bedd1957475ce7f9b47f8942adec4a76182dbe5dbf9
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b0095603c7ffc8d3152c5de9d397fd1beca2e9651bdba9b9da9fbad8a37e19c
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 750,
3
  "best_metric": 0.5089643597602844,
4
  "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
5
- "epoch": 4.8,
6
  "eval_steps": 300,
7
- "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3188,6 +3188,126 @@
3188
  "eval_samples_per_second": 2.07,
3189
  "eval_steps_per_second": 0.518,
3190
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3191
  }
3192
  ],
3193
  "logging_steps": 10,
@@ -3202,12 +3322,12 @@
3202
  "should_evaluate": false,
3203
  "should_log": false,
3204
  "should_save": true,
3205
- "should_training_stop": false
3206
  },
3207
  "attributes": {}
3208
  }
3209
  },
3210
- "total_flos": 5.158805165012275e+17,
3211
  "train_batch_size": 1,
3212
  "trial_name": null,
3213
  "trial_params": null
 
2
  "best_global_step": 750,
3
  "best_metric": 0.5089643597602844,
4
  "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
5
+ "epoch": 5.0,
6
  "eval_steps": 300,
7
+ "global_step": 3125,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3188
  "eval_samples_per_second": 2.07,
3189
  "eval_steps_per_second": 0.518,
3190
  "step": 3000
3191
+ },
3192
+ {
3193
+ "entropy": 0.19047842593863606,
3194
+ "epoch": 4.816,
3195
+ "grad_norm": 0.8224709033966064,
3196
+ "learning_rate": 3.8080000000000006e-06,
3197
+ "loss": 0.1691,
3198
+ "mean_token_accuracy": 0.9483149264007806,
3199
+ "num_tokens": 785457.0,
3200
+ "step": 3010
3201
+ },
3202
+ {
3203
+ "entropy": 0.1947814745362848,
3204
+ "epoch": 4.832,
3205
+ "grad_norm": 0.8581233024597168,
3206
+ "learning_rate": 3.4880000000000003e-06,
3207
+ "loss": 0.1535,
3208
+ "mean_token_accuracy": 0.9543764512985945,
3209
+ "num_tokens": 814006.0,
3210
+ "step": 3020
3211
+ },
3212
+ {
3213
+ "entropy": 0.20228669252246617,
3214
+ "epoch": 4.848,
3215
+ "grad_norm": 0.7815537452697754,
3216
+ "learning_rate": 3.168e-06,
3217
+ "loss": 0.1539,
3218
+ "mean_token_accuracy": 0.9561178237199783,
3219
+ "num_tokens": 836843.0,
3220
+ "step": 3030
3221
+ },
3222
+ {
3223
+ "entropy": 0.2111768877133727,
3224
+ "epoch": 4.864,
3225
+ "grad_norm": 2.0849273204803467,
3226
+ "learning_rate": 2.848e-06,
3227
+ "loss": 0.1553,
3228
+ "mean_token_accuracy": 0.9579557087272406,
3229
+ "num_tokens": 855036.0,
3230
+ "step": 3040
3231
+ },
3232
+ {
3233
+ "entropy": 0.2543737689033151,
3234
+ "epoch": 4.88,
3235
+ "grad_norm": 0.9005395770072937,
3236
+ "learning_rate": 2.528e-06,
3237
+ "loss": 0.18,
3238
+ "mean_token_accuracy": 0.951928498968482,
3239
+ "num_tokens": 867473.0,
3240
+ "step": 3050
3241
+ },
3242
+ {
3243
+ "entropy": 0.19695296385325492,
3244
+ "epoch": 4.896,
3245
+ "grad_norm": 0.8913720846176147,
3246
+ "learning_rate": 2.208e-06,
3247
+ "loss": 0.1731,
3248
+ "mean_token_accuracy": 0.9454629000276327,
3249
+ "num_tokens": 905517.0,
3250
+ "step": 3060
3251
+ },
3252
+ {
3253
+ "entropy": 0.2020930268801749,
3254
+ "epoch": 4.912,
3255
+ "grad_norm": 1.0501484870910645,
3256
+ "learning_rate": 1.8880000000000002e-06,
3257
+ "loss": 0.1583,
3258
+ "mean_token_accuracy": 0.954399960488081,
3259
+ "num_tokens": 933251.0,
3260
+ "step": 3070
3261
+ },
3262
+ {
3263
+ "entropy": 0.20252155787311493,
3264
+ "epoch": 4.928,
3265
+ "grad_norm": 1.03731369972229,
3266
+ "learning_rate": 1.568e-06,
3267
+ "loss": 0.1531,
3268
+ "mean_token_accuracy": 0.9579384963959455,
3269
+ "num_tokens": 956069.0,
3270
+ "step": 3080
3271
+ },
3272
+ {
3273
+ "entropy": 0.2126692888326943,
3274
+ "epoch": 4.944,
3275
+ "grad_norm": 1.107572317123413,
3276
+ "learning_rate": 1.248e-06,
3277
+ "loss": 0.1568,
3278
+ "mean_token_accuracy": 0.9569063678383827,
3279
+ "num_tokens": 974517.0,
3280
+ "step": 3090
3281
+ },
3282
+ {
3283
+ "entropy": 0.24990466320887209,
3284
+ "epoch": 4.96,
3285
+ "grad_norm": 1.2767953872680664,
3286
+ "learning_rate": 9.28e-07,
3287
+ "loss": 0.1851,
3288
+ "mean_token_accuracy": 0.9518057998269797,
3289
+ "num_tokens": 987191.0,
3290
+ "step": 3100
3291
+ },
3292
+ {
3293
+ "entropy": 0.19635155922733247,
3294
+ "epoch": 4.976,
3295
+ "grad_norm": 0.838716447353363,
3296
+ "learning_rate": 6.08e-07,
3297
+ "loss": 0.1689,
3298
+ "mean_token_accuracy": 0.9492763552814723,
3299
+ "num_tokens": 1021442.0,
3300
+ "step": 3110
3301
+ },
3302
+ {
3303
+ "entropy": 0.21572725460864603,
3304
+ "epoch": 4.992,
3305
+ "grad_norm": 0.9043759107589722,
3306
+ "learning_rate": 2.8800000000000004e-07,
3307
+ "loss": 0.161,
3308
+ "mean_token_accuracy": 0.9549260966479778,
3309
+ "num_tokens": 1041350.0,
3310
+ "step": 3120
3311
  }
3312
  ],
3313
  "logging_steps": 10,
 
3322
  "should_evaluate": false,
3323
  "should_log": false,
3324
  "should_save": true,
3325
+ "should_training_stop": true
3326
  },
3327
  "attributes": {}
3328
  }
3329
  },
3330
+ "total_flos": 5.37035906398464e+17,
3331
  "train_batch_size": 1,
3332
  "trial_name": null,
3333
  "trial_params": null