Azrail commited on
Commit
c1be023
·
verified ·
1 Parent(s): b0180f2

Training in progress, step 19000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c92b8ea1ab7aa1c3c704ca60e66275b713cae4225ae135b904f4c11a6b994994
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50692c69fe3ea90614dc625956890e6dd059a4900ffb733cb441c9d9b0be1ed6
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a3f3c04ed042650af1b9c11df2cc35ac490889b1116ef774fd4222e5f41e410
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f34ad85e7a64410399bc0984c1c1c25765a6659574c5d382b0c132a27be2f0f8
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5e2f21ad13dc4eb631067c76b1a1560519d302bc60e4e9cb00bba81ca70a316
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09184de0af072dcf6f15e331e61deb81a6900d407b5c7ebcb519d56082f36e97
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a234e8f8153fe3070553b0b2d9439870baa50cef586f11ec979ecf56399b8c74
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02c3d80aaacee80212417a329afbc88c74b35bad8004900a2301b44b629b4ab7
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3953887783722338,
6
  "eval_steps": 500,
7
- "global_step": 18000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3212,11 +3212,189 @@
3212
  "eval_steps_per_second": 19.114,
3213
  "num_input_tokens_seen": 18874368000,
3214
  "step": 18000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3215
  }
3216
  ],
3217
  "logging_steps": 50,
3218
  "max_steps": 200000,
3219
- "num_input_tokens_seen": 18874368000,
3220
  "num_train_epochs": 5,
3221
  "save_steps": 1000,
3222
  "stateful_callbacks": {
@@ -3231,7 +3409,7 @@
3231
  "attributes": {}
3232
  }
3233
  },
3234
- "total_flos": 1.0749090887368704e+19,
3235
  "train_batch_size": 64,
3236
  "trial_name": null,
3237
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4173548216151357,
6
  "eval_steps": 500,
7
+ "global_step": 19000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3212
  "eval_steps_per_second": 19.114,
3213
  "num_input_tokens_seen": 18874368000,
3214
  "step": 18000
3215
+ },
3216
+ {
3217
+ "epoch": 0.3964870805343789,
3218
+ "grad_norm": 0.1353403478860855,
3219
+ "learning_rate": 0.001,
3220
+ "loss": 2.724,
3221
+ "num_input_tokens_seen": 18926796800,
3222
+ "step": 18050
3223
+ },
3224
+ {
3225
+ "epoch": 0.397585382696524,
3226
+ "grad_norm": 0.15004459023475647,
3227
+ "learning_rate": 0.001,
3228
+ "loss": 2.717,
3229
+ "num_input_tokens_seen": 18979225600,
3230
+ "step": 18100
3231
+ },
3232
+ {
3233
+ "epoch": 0.3986836848586691,
3234
+ "grad_norm": 0.1293007880449295,
3235
+ "learning_rate": 0.001,
3236
+ "loss": 2.7187,
3237
+ "num_input_tokens_seen": 19031654400,
3238
+ "step": 18150
3239
+ },
3240
+ {
3241
+ "epoch": 0.3997819870208142,
3242
+ "grad_norm": 0.16373878717422485,
3243
+ "learning_rate": 0.001,
3244
+ "loss": 2.7217,
3245
+ "num_input_tokens_seen": 19084083200,
3246
+ "step": 18200
3247
+ },
3248
+ {
3249
+ "epoch": 0.4008802891829593,
3250
+ "grad_norm": 0.1529611349105835,
3251
+ "learning_rate": 0.001,
3252
+ "loss": 2.722,
3253
+ "num_input_tokens_seen": 19136512000,
3254
+ "step": 18250
3255
+ },
3256
+ {
3257
+ "epoch": 0.4019785913451044,
3258
+ "grad_norm": 0.14109951257705688,
3259
+ "learning_rate": 0.001,
3260
+ "loss": 2.7232,
3261
+ "num_input_tokens_seen": 19188940800,
3262
+ "step": 18300
3263
+ },
3264
+ {
3265
+ "epoch": 0.40307689350724946,
3266
+ "grad_norm": 0.13841493427753448,
3267
+ "learning_rate": 0.001,
3268
+ "loss": 2.7195,
3269
+ "num_input_tokens_seen": 19241369600,
3270
+ "step": 18350
3271
+ },
3272
+ {
3273
+ "epoch": 0.4041751956693946,
3274
+ "grad_norm": 0.13508476316928864,
3275
+ "learning_rate": 0.001,
3276
+ "loss": 2.7166,
3277
+ "num_input_tokens_seen": 19293798400,
3278
+ "step": 18400
3279
+ },
3280
+ {
3281
+ "epoch": 0.40527349783153965,
3282
+ "grad_norm": 0.1372646540403366,
3283
+ "learning_rate": 0.001,
3284
+ "loss": 2.7212,
3285
+ "num_input_tokens_seen": 19346227200,
3286
+ "step": 18450
3287
+ },
3288
+ {
3289
+ "epoch": 0.4063717999936848,
3290
+ "grad_norm": 0.1485033482313156,
3291
+ "learning_rate": 0.001,
3292
+ "loss": 2.7186,
3293
+ "num_input_tokens_seen": 19398656000,
3294
+ "step": 18500
3295
+ },
3296
+ {
3297
+ "epoch": 0.4063717999936848,
3298
+ "eval_loss": 2.622330904006958,
3299
+ "eval_runtime": 66.3601,
3300
+ "eval_samples_per_second": 75.346,
3301
+ "eval_steps_per_second": 18.837,
3302
+ "num_input_tokens_seen": 19398656000,
3303
+ "step": 18500
3304
+ },
3305
+ {
3306
+ "epoch": 0.40747010215582985,
3307
+ "grad_norm": 0.1484711617231369,
3308
+ "learning_rate": 0.001,
3309
+ "loss": 2.7235,
3310
+ "num_input_tokens_seen": 19451084800,
3311
+ "step": 18550
3312
+ },
3313
+ {
3314
+ "epoch": 0.408568404317975,
3315
+ "grad_norm": 0.141770601272583,
3316
+ "learning_rate": 0.001,
3317
+ "loss": 2.7225,
3318
+ "num_input_tokens_seen": 19503513600,
3319
+ "step": 18600
3320
+ },
3321
+ {
3322
+ "epoch": 0.40966670648012005,
3323
+ "grad_norm": 0.1213323250412941,
3324
+ "learning_rate": 0.001,
3325
+ "loss": 2.7212,
3326
+ "num_input_tokens_seen": 19555942400,
3327
+ "step": 18650
3328
+ },
3329
+ {
3330
+ "epoch": 0.4107650086422651,
3331
+ "grad_norm": 0.14149373769760132,
3332
+ "learning_rate": 0.001,
3333
+ "loss": 2.7181,
3334
+ "num_input_tokens_seen": 19608371200,
3335
+ "step": 18700
3336
+ },
3337
+ {
3338
+ "epoch": 0.41186331080441024,
3339
+ "grad_norm": 0.13964049518108368,
3340
+ "learning_rate": 0.001,
3341
+ "loss": 2.7147,
3342
+ "num_input_tokens_seen": 19660800000,
3343
+ "step": 18750
3344
+ },
3345
+ {
3346
+ "epoch": 0.4129616129665553,
3347
+ "grad_norm": 0.1384592205286026,
3348
+ "learning_rate": 0.001,
3349
+ "loss": 2.7141,
3350
+ "num_input_tokens_seen": 19713228800,
3351
+ "step": 18800
3352
+ },
3353
+ {
3354
+ "epoch": 0.41405991512870044,
3355
+ "grad_norm": 0.15027381479740143,
3356
+ "learning_rate": 0.001,
3357
+ "loss": 2.7185,
3358
+ "num_input_tokens_seen": 19765657600,
3359
+ "step": 18850
3360
+ },
3361
+ {
3362
+ "epoch": 0.4151582172908455,
3363
+ "grad_norm": 0.15221597254276276,
3364
+ "learning_rate": 0.001,
3365
+ "loss": 2.7206,
3366
+ "num_input_tokens_seen": 19818086400,
3367
+ "step": 18900
3368
+ },
3369
+ {
3370
+ "epoch": 0.4162565194529906,
3371
+ "grad_norm": 0.1272735893726349,
3372
+ "learning_rate": 0.001,
3373
+ "loss": 2.7183,
3374
+ "num_input_tokens_seen": 19870515200,
3375
+ "step": 18950
3376
+ },
3377
+ {
3378
+ "epoch": 0.4173548216151357,
3379
+ "grad_norm": 0.1258268654346466,
3380
+ "learning_rate": 0.001,
3381
+ "loss": 2.7117,
3382
+ "num_input_tokens_seen": 19922944000,
3383
+ "step": 19000
3384
+ },
3385
+ {
3386
+ "epoch": 0.4173548216151357,
3387
+ "eval_loss": 2.619187116622925,
3388
+ "eval_runtime": 65.7537,
3389
+ "eval_samples_per_second": 76.041,
3390
+ "eval_steps_per_second": 19.01,
3391
+ "num_input_tokens_seen": 19922944000,
3392
+ "step": 19000
3393
  }
3394
  ],
3395
  "logging_steps": 50,
3396
  "max_steps": 200000,
3397
+ "num_input_tokens_seen": 19922944000,
3398
  "num_train_epochs": 5,
3399
  "save_steps": 1000,
3400
  "stateful_callbacks": {
 
3409
  "attributes": {}
3410
  }
3411
  },
3412
+ "total_flos": 1.1346262603333632e+19,
3413
  "train_batch_size": 64,
3414
  "trial_name": null,
3415
  "trial_params": null