FormlessAI commited on
Commit
32989f6
·
verified ·
1 Parent(s): bf2d363

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6b3d9d6f8c7c3d98a19f31f4f971e689cc6fdfb0852e1e518bf400fc7d18e3e
3
  size 1037269336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19b1fa32ac470abea9aaed5df0314fdf8255e2f5d17488e027920e596ac1b454
3
  size 1037269336
last-checkpoint/global_step2200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17f06d8019305880e63ff917b8eaeade273a1f4efe92b142374a9f453b975ba1
3
+ size 781993445
last-checkpoint/global_step2200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d88ca866ed4caf5fe9bac9eb2dbf7945afd594a1ced48078e756b2ec4f52391
3
+ size 781993509
last-checkpoint/global_step2200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e2c549dac5e475a7e15a608b79addfb82769ab62a4fb5a74e1660ba75c8c15f
3
+ size 781993509
last-checkpoint/global_step2200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:577761a19602e32eca30c81aefb3456cd3a26b9218d94e1ea056cc48e18c2862
3
+ size 781993509
last-checkpoint/global_step2200/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3476292ffcb6799b5340968f39c80013d42c347ec1a8f3b423d91fa1ade313f7
3
+ size 2610290277
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step2050
 
1
+ global_step2200
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c193fe0eb5414b6e7724a1b6744c6fa4f71192c50142788a1655017eb0888732
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9aa753095cc0a44fced50afca6bff1b99146c481ddc3dc764d689ff5546d5fd
3
  size 15429
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7433361eaf9398847ea003a5f4af9a337f1cc9a3b83827c19956da148a1d9e34
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90c2966bfb4a402e04ec2751d9f8452dc016d605399a989dce9bed4000125da0
3
  size 15429
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e80352d2361ac117f2bf39b8122fb7a7bcfa982eaa10345a5e5e36808edcf2c
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68fa3782d3dbab732db659905737cfd4c32e0162423b6b3bf8864f2d1fee1b91
3
  size 15429
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af1d5a16d78e90cc8be44fb0342444095eba9473e2cb4a34b58006386e796243
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e218ffc86ec50875e9f6816271fc0465b75694055819b0e37bcd282c94f6dbe5
3
  size 15429
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c2c94b278df9a49d0ea9e3b3354a733420163871be4c32635bfe524c284f7ac
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0282a728c75d0bb9e123936361d8d60683d939f3df4b2863405d14fc34b553e7
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "best_global_step": null,
3
- "best_metric": 2.1106083393096924,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2980084314580608,
6
  "eval_steps": 50,
7
- "global_step": 2050,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3206,6 +3206,240 @@
3206
  "eval_samples_per_second": 175.617,
3207
  "eval_steps_per_second": 11.013,
3208
  "step": 2050
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3209
  }
3210
  ],
3211
  "logging_steps": 5,
@@ -3234,7 +3468,7 @@
3234
  "attributes": {}
3235
  }
3236
  },
3237
- "total_flos": 5.336645061784371e+17,
3238
  "train_batch_size": 4,
3239
  "trial_name": null,
3240
  "trial_params": null
 
1
  {
2
  "best_global_step": null,
3
+ "best_metric": 2.08577561378479,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.31981392644279694,
6
  "eval_steps": 50,
7
+ "global_step": 2200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3206
  "eval_samples_per_second": 175.617,
3207
  "eval_steps_per_second": 11.013,
3208
  "step": 2050
3209
+ },
3210
+ {
3211
+ "epoch": 0.2987352812908853,
3212
+ "grad_norm": 2.2958388328552246,
3213
+ "learning_rate": 9.087037358857628e-05,
3214
+ "loss": 2.1674,
3215
+ "step": 2055
3216
+ },
3217
+ {
3218
+ "epoch": 0.29946213112370984,
3219
+ "grad_norm": 2.6375505924224854,
3220
+ "learning_rate": 9.082413186059305e-05,
3221
+ "loss": 2.0371,
3222
+ "step": 2060
3223
+ },
3224
+ {
3225
+ "epoch": 0.3001889809565344,
3226
+ "grad_norm": 2.563561201095581,
3227
+ "learning_rate": 9.077778760554678e-05,
3228
+ "loss": 2.2449,
3229
+ "step": 2065
3230
+ },
3231
+ {
3232
+ "epoch": 0.3009158307893589,
3233
+ "grad_norm": 2.3851094245910645,
3234
+ "learning_rate": 9.07313409401091e-05,
3235
+ "loss": 2.2764,
3236
+ "step": 2070
3237
+ },
3238
+ {
3239
+ "epoch": 0.30164268062218347,
3240
+ "grad_norm": 2.5274460315704346,
3241
+ "learning_rate": 9.068479198120939e-05,
3242
+ "loss": 1.9841,
3243
+ "step": 2075
3244
+ },
3245
+ {
3246
+ "epoch": 0.302369530455008,
3247
+ "grad_norm": 2.51540470123291,
3248
+ "learning_rate": 9.063814084603465e-05,
3249
+ "loss": 2.0978,
3250
+ "step": 2080
3251
+ },
3252
+ {
3253
+ "epoch": 0.30309638028783253,
3254
+ "grad_norm": 2.290086507797241,
3255
+ "learning_rate": 9.059138765202903e-05,
3256
+ "loss": 2.0059,
3257
+ "step": 2085
3258
+ },
3259
+ {
3260
+ "epoch": 0.3038232301206571,
3261
+ "grad_norm": 2.4995152950286865,
3262
+ "learning_rate": 9.054453251689364e-05,
3263
+ "loss": 2.2743,
3264
+ "step": 2090
3265
+ },
3266
+ {
3267
+ "epoch": 0.3045500799534816,
3268
+ "grad_norm": 2.180800199508667,
3269
+ "learning_rate": 9.049757555858624e-05,
3270
+ "loss": 2.1006,
3271
+ "step": 2095
3272
+ },
3273
+ {
3274
+ "epoch": 0.30527692978630616,
3275
+ "grad_norm": 2.5430526733398438,
3276
+ "learning_rate": 9.04505168953209e-05,
3277
+ "loss": 2.2312,
3278
+ "step": 2100
3279
+ },
3280
+ {
3281
+ "epoch": 0.30527692978630616,
3282
+ "eval_loss": 2.094419240951538,
3283
+ "eval_runtime": 21.9822,
3284
+ "eval_samples_per_second": 150.167,
3285
+ "eval_steps_per_second": 9.417,
3286
+ "step": 2100
3287
+ },
3288
+ {
3289
+ "epoch": 0.30600377961913067,
3290
+ "grad_norm": 2.994030237197876,
3291
+ "learning_rate": 9.040335664556774e-05,
3292
+ "loss": 2.1454,
3293
+ "step": 2105
3294
+ },
3295
+ {
3296
+ "epoch": 0.30673062945195523,
3297
+ "grad_norm": 2.507899761199951,
3298
+ "learning_rate": 9.035609492805267e-05,
3299
+ "loss": 2.3506,
3300
+ "step": 2110
3301
+ },
3302
+ {
3303
+ "epoch": 0.3074574792847798,
3304
+ "grad_norm": 2.3985066413879395,
3305
+ "learning_rate": 9.030873186175699e-05,
3306
+ "loss": 2.1076,
3307
+ "step": 2115
3308
+ },
3309
+ {
3310
+ "epoch": 0.3081843291176043,
3311
+ "grad_norm": 2.315556764602661,
3312
+ "learning_rate": 9.026126756591716e-05,
3313
+ "loss": 1.9807,
3314
+ "step": 2120
3315
+ },
3316
+ {
3317
+ "epoch": 0.30891117895042886,
3318
+ "grad_norm": 2.4136197566986084,
3319
+ "learning_rate": 9.021370216002447e-05,
3320
+ "loss": 2.2067,
3321
+ "step": 2125
3322
+ },
3323
+ {
3324
+ "epoch": 0.30963802878325336,
3325
+ "grad_norm": 2.6457340717315674,
3326
+ "learning_rate": 9.016603576382481e-05,
3327
+ "loss": 2.3536,
3328
+ "step": 2130
3329
+ },
3330
+ {
3331
+ "epoch": 0.3103648786160779,
3332
+ "grad_norm": 2.527038097381592,
3333
+ "learning_rate": 9.011826849731824e-05,
3334
+ "loss": 2.1984,
3335
+ "step": 2135
3336
+ },
3337
+ {
3338
+ "epoch": 0.3110917284489025,
3339
+ "grad_norm": 2.422018527984619,
3340
+ "learning_rate": 9.007040048075882e-05,
3341
+ "loss": 2.3617,
3342
+ "step": 2140
3343
+ },
3344
+ {
3345
+ "epoch": 0.311818578281727,
3346
+ "grad_norm": 2.45200777053833,
3347
+ "learning_rate": 9.002243183465422e-05,
3348
+ "loss": 2.2631,
3349
+ "step": 2145
3350
+ },
3351
+ {
3352
+ "epoch": 0.31254542811455155,
3353
+ "grad_norm": 2.2823517322540283,
3354
+ "learning_rate": 8.997436267976544e-05,
3355
+ "loss": 1.9974,
3356
+ "step": 2150
3357
+ },
3358
+ {
3359
+ "epoch": 0.31254542811455155,
3360
+ "eval_loss": 2.101454019546509,
3361
+ "eval_runtime": 18.9568,
3362
+ "eval_samples_per_second": 174.133,
3363
+ "eval_steps_per_second": 10.92,
3364
+ "step": 2150
3365
+ },
3366
+ {
3367
+ "epoch": 0.31327227794737605,
3368
+ "grad_norm": 2.5826852321624756,
3369
+ "learning_rate": 8.992619313710653e-05,
3370
+ "loss": 2.2736,
3371
+ "step": 2155
3372
+ },
3373
+ {
3374
+ "epoch": 0.3139991277802006,
3375
+ "grad_norm": 2.4211437702178955,
3376
+ "learning_rate": 8.987792332794426e-05,
3377
+ "loss": 2.2469,
3378
+ "step": 2160
3379
+ },
3380
+ {
3381
+ "epoch": 0.3147259776130252,
3382
+ "grad_norm": 3.2002980709075928,
3383
+ "learning_rate": 8.98295533737978e-05,
3384
+ "loss": 2.2387,
3385
+ "step": 2165
3386
+ },
3387
+ {
3388
+ "epoch": 0.3154528274458497,
3389
+ "grad_norm": 2.8662610054016113,
3390
+ "learning_rate": 8.978108339643846e-05,
3391
+ "loss": 2.2728,
3392
+ "step": 2170
3393
+ },
3394
+ {
3395
+ "epoch": 0.31617967727867424,
3396
+ "grad_norm": 2.5767691135406494,
3397
+ "learning_rate": 8.973251351788936e-05,
3398
+ "loss": 2.0728,
3399
+ "step": 2175
3400
+ },
3401
+ {
3402
+ "epoch": 0.31690652711149875,
3403
+ "grad_norm": 2.2617924213409424,
3404
+ "learning_rate": 8.968384386042512e-05,
3405
+ "loss": 2.0235,
3406
+ "step": 2180
3407
+ },
3408
+ {
3409
+ "epoch": 0.3176333769443233,
3410
+ "grad_norm": 2.60357928276062,
3411
+ "learning_rate": 8.96350745465715e-05,
3412
+ "loss": 2.0803,
3413
+ "step": 2185
3414
+ },
3415
+ {
3416
+ "epoch": 0.3183602267771478,
3417
+ "grad_norm": 2.360905408859253,
3418
+ "learning_rate": 8.958620569910522e-05,
3419
+ "loss": 2.1212,
3420
+ "step": 2190
3421
+ },
3422
+ {
3423
+ "epoch": 0.3190870766099724,
3424
+ "grad_norm": 2.760329246520996,
3425
+ "learning_rate": 8.953723744105356e-05,
3426
+ "loss": 2.2397,
3427
+ "step": 2195
3428
+ },
3429
+ {
3430
+ "epoch": 0.31981392644279694,
3431
+ "grad_norm": 2.653019428253174,
3432
+ "learning_rate": 8.948816989569402e-05,
3433
+ "loss": 2.1049,
3434
+ "step": 2200
3435
+ },
3436
+ {
3437
+ "epoch": 0.31981392644279694,
3438
+ "eval_loss": 2.08577561378479,
3439
+ "eval_runtime": 18.7698,
3440
+ "eval_samples_per_second": 175.867,
3441
+ "eval_steps_per_second": 11.028,
3442
+ "step": 2200
3443
  }
3444
  ],
3445
  "logging_steps": 5,
 
3468
  "attributes": {}
3469
  }
3470
  },
3471
+ "total_flos": 5.7344243893744435e+17,
3472
  "train_batch_size": 4,
3473
  "trial_name": null,
3474
  "trial_params": null