rakhman-llm commited on
Commit
88ea7c8
·
verified ·
1 Parent(s): 61eb493

Training in progress, step 3500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8e4a377d9d9b4ee3182c91d09c6aa160ff1ade127f1e240b0ace4f4312419ec
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f249b4c12d0314b83435dfe63816bee13e5910dc2b4ef014ba7a61e89f43ac0
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37c76f78e4bea0bb233f5490f2342fc733388c7761a0fa0c0e5fdf8f1a5336d2
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2341c66cd39e43b1d806239932be769b677cdef8e280c9b4ff34c8958b0ad34
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f180d57072cebd56f1e36f710e6b62868e2b14fe85aee7effc0a0d28a6763011
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbe26b37d45ea8d0357f9b26f439f5ad172b1a279f1b8765178fa166fce80cbc
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19fa64c0f058dbaac84e2a6129da56913abd2f29f4a3f61f13f6abfb2cd3ff5f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9c9a3885ae8ac27be8fa78a1d765dfbf434202614784e058d9c43f9ea39114b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.08551913499832153,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-3000",
4
- "epoch": 0.48,
5
  "eval_steps": 500,
6
- "global_step": 3000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -81,9 +81,9 @@
81
  {
82
  "epoch": 0.08,
83
  "eval_loss": 0.09235642850399017,
84
- "eval_runtime": 116.7651,
85
- "eval_samples_per_second": 17.128,
86
- "eval_steps_per_second": 2.141,
87
  "step": 500
88
  },
89
  {
@@ -159,9 +159,9 @@
159
  {
160
  "epoch": 0.16,
161
  "eval_loss": 0.08949962258338928,
162
- "eval_runtime": 116.7407,
163
- "eval_samples_per_second": 17.132,
164
- "eval_steps_per_second": 2.141,
165
  "step": 1000
166
  },
167
  {
@@ -237,9 +237,9 @@
237
  {
238
  "epoch": 0.24,
239
  "eval_loss": 0.08808805048465729,
240
- "eval_runtime": 116.8722,
241
- "eval_samples_per_second": 17.113,
242
- "eval_steps_per_second": 2.139,
243
  "step": 1500
244
  },
245
  {
@@ -315,9 +315,9 @@
315
  {
316
  "epoch": 0.32,
317
  "eval_loss": 0.08704760670661926,
318
- "eval_runtime": 116.8362,
319
- "eval_samples_per_second": 17.118,
320
- "eval_steps_per_second": 2.14,
321
  "step": 2000
322
  },
323
  {
@@ -393,9 +393,9 @@
393
  {
394
  "epoch": 0.4,
395
  "eval_loss": 0.08615937829017639,
396
- "eval_runtime": 116.9591,
397
- "eval_samples_per_second": 17.1,
398
- "eval_steps_per_second": 2.137,
399
  "step": 2500
400
  },
401
  {
@@ -471,10 +471,88 @@
471
  {
472
  "epoch": 0.48,
473
  "eval_loss": 0.08551913499832153,
474
- "eval_runtime": 116.545,
475
- "eval_samples_per_second": 17.161,
476
- "eval_steps_per_second": 2.145,
477
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478
  }
479
  ],
480
  "logging_steps": 50,
@@ -494,7 +572,7 @@
494
  "attributes": {}
495
  }
496
  },
497
- "total_flos": 1.461498937344e+16,
498
  "train_batch_size": 8,
499
  "trial_name": null,
500
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.08540560305118561,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-3500",
4
+ "epoch": 0.56,
5
  "eval_steps": 500,
6
+ "global_step": 3500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
81
  {
82
  "epoch": 0.08,
83
  "eval_loss": 0.09235642850399017,
84
+ "eval_runtime": 109.274,
85
+ "eval_samples_per_second": 18.303,
86
+ "eval_steps_per_second": 2.288,
87
  "step": 500
88
  },
89
  {
 
159
  {
160
  "epoch": 0.16,
161
  "eval_loss": 0.08949962258338928,
162
+ "eval_runtime": 109.2536,
163
+ "eval_samples_per_second": 18.306,
164
+ "eval_steps_per_second": 2.288,
165
  "step": 1000
166
  },
167
  {
 
237
  {
238
  "epoch": 0.24,
239
  "eval_loss": 0.08808805048465729,
240
+ "eval_runtime": 109.2355,
241
+ "eval_samples_per_second": 18.309,
242
+ "eval_steps_per_second": 2.289,
243
  "step": 1500
244
  },
245
  {
 
315
  {
316
  "epoch": 0.32,
317
  "eval_loss": 0.08704760670661926,
318
+ "eval_runtime": 109.4348,
319
+ "eval_samples_per_second": 18.276,
320
+ "eval_steps_per_second": 2.284,
321
  "step": 2000
322
  },
323
  {
 
393
  {
394
  "epoch": 0.4,
395
  "eval_loss": 0.08615937829017639,
396
+ "eval_runtime": 109.2621,
397
+ "eval_samples_per_second": 18.305,
398
+ "eval_steps_per_second": 2.288,
399
  "step": 2500
400
  },
401
  {
 
471
  {
472
  "epoch": 0.48,
473
  "eval_loss": 0.08551913499832153,
474
+ "eval_runtime": 109.2626,
475
+ "eval_samples_per_second": 18.305,
476
+ "eval_steps_per_second": 2.288,
477
  "step": 3000
478
+ },
479
+ {
480
+ "epoch": 0.488,
481
+ "grad_norm": 7060.21826171875,
482
+ "learning_rate": 2.5120000000000003e-05,
483
+ "loss": 0.0684,
484
+ "step": 3050
485
+ },
486
+ {
487
+ "epoch": 0.496,
488
+ "grad_norm": 7841.55322265625,
489
+ "learning_rate": 2.504e-05,
490
+ "loss": 0.0653,
491
+ "step": 3100
492
+ },
493
+ {
494
+ "epoch": 0.504,
495
+ "grad_norm": 5290.3271484375,
496
+ "learning_rate": 2.4959999999999998e-05,
497
+ "loss": 0.0668,
498
+ "step": 3150
499
+ },
500
+ {
501
+ "epoch": 0.512,
502
+ "grad_norm": 6200.4853515625,
503
+ "learning_rate": 2.4880000000000002e-05,
504
+ "loss": 0.0665,
505
+ "step": 3200
506
+ },
507
+ {
508
+ "epoch": 0.52,
509
+ "grad_norm": 6859.83544921875,
510
+ "learning_rate": 2.48e-05,
511
+ "loss": 0.0678,
512
+ "step": 3250
513
+ },
514
+ {
515
+ "epoch": 0.528,
516
+ "grad_norm": 7718.70068359375,
517
+ "learning_rate": 2.472e-05,
518
+ "loss": 0.0679,
519
+ "step": 3300
520
+ },
521
+ {
522
+ "epoch": 0.536,
523
+ "grad_norm": 10752.4873046875,
524
+ "learning_rate": 2.464e-05,
525
+ "loss": 0.062,
526
+ "step": 3350
527
+ },
528
+ {
529
+ "epoch": 0.544,
530
+ "grad_norm": 6991.5087890625,
531
+ "learning_rate": 2.456e-05,
532
+ "loss": 0.0659,
533
+ "step": 3400
534
+ },
535
+ {
536
+ "epoch": 0.552,
537
+ "grad_norm": 6204.99658203125,
538
+ "learning_rate": 2.448e-05,
539
+ "loss": 0.0636,
540
+ "step": 3450
541
+ },
542
+ {
543
+ "epoch": 0.56,
544
+ "grad_norm": 13521.5908203125,
545
+ "learning_rate": 2.44e-05,
546
+ "loss": 0.0671,
547
+ "step": 3500
548
+ },
549
+ {
550
+ "epoch": 0.56,
551
+ "eval_loss": 0.08540560305118561,
552
+ "eval_runtime": 109.3641,
553
+ "eval_samples_per_second": 18.288,
554
+ "eval_steps_per_second": 2.286,
555
+ "step": 3500
556
  }
557
  ],
558
  "logging_steps": 50,
 
572
  "attributes": {}
573
  }
574
  },
575
+ "total_flos": 1.705082093568e+16,
576
  "train_batch_size": 8,
577
  "trial_name": null,
578
  "trial_params": null