FormlessAI commited on
Commit
2c9b798
·
verified ·
1 Parent(s): db2d193

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:771f7595253335d0f7b3e5d9548620ff920977b25d1013493890387e97d73a3d
3
  size 1037269336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b23217742b312e0ba6a642fbced78169e97e5bd94aa8ec9429ceefc05f1a76b
3
  size 1037269336
last-checkpoint/global_step4100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1dea63ea269d8a8db77c339ab50f09e474eadb1a6659d06ab7df2dbde5aac2c
3
+ size 781993445
last-checkpoint/global_step4100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea94dca34e505bbaa93fb7fe6a1bcdcc87d57c1eb86c42d96a787243b93d70bb
3
+ size 781993509
last-checkpoint/global_step4100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a02d8491507c62bd6bf4534b2406d66207877a390403265562036f29f45b719
3
+ size 781993509
last-checkpoint/global_step4100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:791541856934b608332f89404414aa6812d82a818052289589ae21c1b1b0ec9f
3
+ size 781993509
last-checkpoint/global_step4100/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2cbbf173e016d84a36966c4ae3c102fc4756b8d9faaa3ddf6008f53e9b95ee1
3
+ size 2610290277
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step4000
 
1
+ global_step4100
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7738b79cde91732aa1ae36546c20e2adfb138db06ede459f3546964f4c72f003
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bbc3d1660e77580d684add9546e5fe8bd6fc84071100e9a520c41d938330a79
3
  size 15429
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c87bb0bbd4a5d934e9e0ee64426668f65a3c0671e53f80788bd09202aaa80ce
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00d7703de6e560ffbbf010cbfffa20522d7da9ff9f4719e1064c19461079ea48
3
  size 15429
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3438bbb08774094f199cd5833a18b6fec0ce5cda0f318f97029e7d59620cafc6
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:204484b4dccae23b095603bfb2d8fc482440509c028607bd9556adf092617aac
3
  size 15429
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4893134b5c11d042dab70821374bd20a7f7800fefcc8fad1ea78520c80bfcce6
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0fceb540dfe7b45fc0da1b3cdddd6d3f71c61414fae78500c040a17afd7ae2e
3
  size 15429
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7278ee28e675006b1a18eabb528c5e753ec5c79a4c5c843c134b5fc72246eac3
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59fba9955671eaa664ef7e8ac2aec090cfd8274510ae38341a2658c4438b5bf0
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "best_global_step": null,
3
- "best_metric": 1.9395991563796997,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.5814798662596308,
6
  "eval_steps": 50,
7
- "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6248,6 +6248,162 @@
6248
  "eval_samples_per_second": 172.663,
6249
  "eval_steps_per_second": 10.827,
6250
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6251
  }
6252
  ],
6253
  "logging_steps": 5,
@@ -6276,7 +6432,7 @@
6276
  "attributes": {}
6277
  }
6278
  },
6279
- "total_flos": 1.0434609863437844e+18,
6280
  "train_batch_size": 4,
6281
  "trial_name": null,
6282
  "trial_params": null
 
1
  {
2
  "best_global_step": null,
3
+ "best_metric": 1.923519253730774,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5960168629161215,
6
  "eval_steps": 50,
7
+ "global_step": 4100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6248
  "eval_samples_per_second": 172.663,
6249
  "eval_steps_per_second": 10.827,
6250
  "step": 4000
6251
+ },
6252
+ {
6253
+ "epoch": 0.5822067160924553,
6254
+ "grad_norm": 2.408169984817505,
6255
+ "learning_rate": 6.642383155912741e-05,
6256
+ "loss": 2.133,
6257
+ "step": 4005
6258
+ },
6259
+ {
6260
+ "epoch": 0.5829335659252798,
6261
+ "grad_norm": 2.9105172157287598,
6262
+ "learning_rate": 6.634866071919054e-05,
6263
+ "loss": 2.124,
6264
+ "step": 4010
6265
+ },
6266
+ {
6267
+ "epoch": 0.5836604157581043,
6268
+ "grad_norm": 2.630783796310425,
6269
+ "learning_rate": 6.627344896916006e-05,
6270
+ "loss": 2.1179,
6271
+ "step": 4015
6272
+ },
6273
+ {
6274
+ "epoch": 0.5843872655909289,
6275
+ "grad_norm": 2.399688482284546,
6276
+ "learning_rate": 6.619819649838151e-05,
6277
+ "loss": 2.1174,
6278
+ "step": 4020
6279
+ },
6280
+ {
6281
+ "epoch": 0.5851141154237535,
6282
+ "grad_norm": 2.5117554664611816,
6283
+ "learning_rate": 6.612290349630285e-05,
6284
+ "loss": 2.0063,
6285
+ "step": 4025
6286
+ },
6287
+ {
6288
+ "epoch": 0.585840965256578,
6289
+ "grad_norm": 2.6324381828308105,
6290
+ "learning_rate": 6.604757015247416e-05,
6291
+ "loss": 2.057,
6292
+ "step": 4030
6293
+ },
6294
+ {
6295
+ "epoch": 0.5865678150894025,
6296
+ "grad_norm": 2.442852258682251,
6297
+ "learning_rate": 6.597219665654702e-05,
6298
+ "loss": 1.933,
6299
+ "step": 4035
6300
+ },
6301
+ {
6302
+ "epoch": 0.587294664922227,
6303
+ "grad_norm": 2.4938302040100098,
6304
+ "learning_rate": 6.589678319827412e-05,
6305
+ "loss": 2.2347,
6306
+ "step": 4040
6307
+ },
6308
+ {
6309
+ "epoch": 0.5880215147550516,
6310
+ "grad_norm": 2.2091469764709473,
6311
+ "learning_rate": 6.582132996750874e-05,
6312
+ "loss": 2.0614,
6313
+ "step": 4045
6314
+ },
6315
+ {
6316
+ "epoch": 0.5887483645878762,
6317
+ "grad_norm": 2.2665116786956787,
6318
+ "learning_rate": 6.574583715420433e-05,
6319
+ "loss": 2.085,
6320
+ "step": 4050
6321
+ },
6322
+ {
6323
+ "epoch": 0.5887483645878762,
6324
+ "eval_loss": 1.9283087253570557,
6325
+ "eval_runtime": 21.1511,
6326
+ "eval_samples_per_second": 156.068,
6327
+ "eval_steps_per_second": 9.787,
6328
+ "step": 4050
6329
+ },
6330
+ {
6331
+ "epoch": 0.5894752144207007,
6332
+ "grad_norm": 2.5516645908355713,
6333
+ "learning_rate": 6.567030494841393e-05,
6334
+ "loss": 2.1021,
6335
+ "step": 4055
6336
+ },
6337
+ {
6338
+ "epoch": 0.5902020642535252,
6339
+ "grad_norm": 2.4371495246887207,
6340
+ "learning_rate": 6.559473354028979e-05,
6341
+ "loss": 2.0655,
6342
+ "step": 4060
6343
+ },
6344
+ {
6345
+ "epoch": 0.5909289140863497,
6346
+ "grad_norm": 2.0865836143493652,
6347
+ "learning_rate": 6.551912312008285e-05,
6348
+ "loss": 2.1788,
6349
+ "step": 4065
6350
+ },
6351
+ {
6352
+ "epoch": 0.5916557639191743,
6353
+ "grad_norm": 2.408687114715576,
6354
+ "learning_rate": 6.544347387814224e-05,
6355
+ "loss": 2.1187,
6356
+ "step": 4070
6357
+ },
6358
+ {
6359
+ "epoch": 0.5923826137519989,
6360
+ "grad_norm": 2.4930145740509033,
6361
+ "learning_rate": 6.536778600491481e-05,
6362
+ "loss": 2.2741,
6363
+ "step": 4075
6364
+ },
6365
+ {
6366
+ "epoch": 0.5931094635848234,
6367
+ "grad_norm": 2.3992059230804443,
6368
+ "learning_rate": 6.529205969094474e-05,
6369
+ "loss": 1.9715,
6370
+ "step": 4080
6371
+ },
6372
+ {
6373
+ "epoch": 0.5938363134176479,
6374
+ "grad_norm": 2.214466094970703,
6375
+ "learning_rate": 6.521629512687291e-05,
6376
+ "loss": 2.1169,
6377
+ "step": 4085
6378
+ },
6379
+ {
6380
+ "epoch": 0.5945631632504724,
6381
+ "grad_norm": 2.3627679347991943,
6382
+ "learning_rate": 6.514049250343653e-05,
6383
+ "loss": 1.9602,
6384
+ "step": 4090
6385
+ },
6386
+ {
6387
+ "epoch": 0.595290013083297,
6388
+ "grad_norm": 2.594008684158325,
6389
+ "learning_rate": 6.506465201146858e-05,
6390
+ "loss": 2.1459,
6391
+ "step": 4095
6392
+ },
6393
+ {
6394
+ "epoch": 0.5960168629161215,
6395
+ "grad_norm": 1.9788795709609985,
6396
+ "learning_rate": 6.498877384189746e-05,
6397
+ "loss": 1.898,
6398
+ "step": 4100
6399
+ },
6400
+ {
6401
+ "epoch": 0.5960168629161215,
6402
+ "eval_loss": 1.923519253730774,
6403
+ "eval_runtime": 18.9492,
6404
+ "eval_samples_per_second": 174.203,
6405
+ "eval_steps_per_second": 10.924,
6406
+ "step": 4100
6407
  }
6408
  ],
6409
  "logging_steps": 5,
 
6432
  "attributes": {}
6433
  }
6434
  },
6435
+ "total_flos": 1.06971867773508e+18,
6436
  "train_batch_size": 4,
6437
  "trial_name": null,
6438
  "trial_params": null