amos1088 commited on
Commit
7c50dd6
·
verified ·
1 Parent(s): 3072a79

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc89b84ff994a416b1fd8a35db8384b6b9a40381f903264827d17ecdff5f45be
3
  size 35668592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddc2f6d25f763b9de0ed5430306eb636ad7c54269a7b3b70c998dbb4d0242450
3
  size 35668592
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6ffe29712d5eeee8f43f601ab13dd23dc7df7e12fb41454b57276f9b97c3680
3
  size 18257163
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d46dc4e7f4c38bd7d740c9f5afd10a56d0c7b90d973e3e7ceaf2a89f6ab3066a
3
  size 18257163
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a17d91ff6dcca4633791a0e119c48601550130760f9eabb15146d59647aafb1
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:532727a1ac4eb5b9846bd900afbac875d546089027ad66d97c611355ff543eb1
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c680537e123ff05619f00235a6bb4e1115b680be2ad94388dedf9dffc0968a0
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20260be3fc45a3cfa8fd6a74639f50b3b33a87c97c47f472437044dfb3488bc9
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1024392986cac23a001e3e4a426a85b67203cda9404b22609e539557db80bbac
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02647c16a79c538141d09a2e5ec5135201f004952aab2cef2e8f97c0a0eb658e
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.15043249341857842,
6
  "eval_steps": 500,
7
- "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -458,6 +458,96 @@
458
  "mean_token_accuracy": 0.7375,
459
  "num_tokens": 1736202.0,
460
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
  }
462
  ],
463
  "logging_steps": 10,
@@ -477,7 +567,7 @@
477
  "attributes": {}
478
  }
479
  },
480
- "total_flos": 3.887174157355008e+16,
481
  "train_batch_size": 1,
482
  "trial_name": null,
483
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.1805189921022941,
6
  "eval_steps": 500,
7
+ "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
458
  "mean_token_accuracy": 0.7375,
459
  "num_tokens": 1736202.0,
460
  "step": 500
461
+ },
462
+ {
463
+ "epoch": 0.15344114328695,
464
+ "grad_norm": 0.0044481828808784485,
465
+ "learning_rate": 2.530060120240481e-05,
466
+ "loss": 0.0004,
467
+ "mean_token_accuracy": 0.6,
468
+ "num_tokens": 1770561.0,
469
+ "step": 510
470
+ },
471
+ {
472
+ "epoch": 0.15644979315532154,
473
+ "grad_norm": 0.0005716494051739573,
474
+ "learning_rate": 2.5801603206412827e-05,
475
+ "loss": 0.0005,
476
+ "mean_token_accuracy": 0.625,
477
+ "num_tokens": 1804749.0,
478
+ "step": 520
479
+ },
480
+ {
481
+ "epoch": 0.1594584430236931,
482
+ "grad_norm": 0.00020559463882818818,
483
+ "learning_rate": 2.6302605210420845e-05,
484
+ "loss": 0.002,
485
+ "mean_token_accuracy": 0.5375,
486
+ "num_tokens": 1839027.0,
487
+ "step": 530
488
+ },
489
+ {
490
+ "epoch": 0.16246709289206468,
491
+ "grad_norm": 0.000684644328430295,
492
+ "learning_rate": 2.6803607214428862e-05,
493
+ "loss": 0.0012,
494
+ "mean_token_accuracy": 0.6375,
495
+ "num_tokens": 1872641.0,
496
+ "step": 540
497
+ },
498
+ {
499
+ "epoch": 0.16547574276043625,
500
+ "grad_norm": 0.0008667957736179233,
501
+ "learning_rate": 2.730460921843688e-05,
502
+ "loss": 0.0011,
503
+ "mean_token_accuracy": 0.5375,
504
+ "num_tokens": 1908397.0,
505
+ "step": 550
506
+ },
507
+ {
508
+ "epoch": 0.16848439262880782,
509
+ "grad_norm": 0.0003756976220756769,
510
+ "learning_rate": 2.780561122244489e-05,
511
+ "loss": 0.0087,
512
+ "mean_token_accuracy": 0.675,
513
+ "num_tokens": 1942186.0,
514
+ "step": 560
515
+ },
516
+ {
517
+ "epoch": 0.1714930424971794,
518
+ "grad_norm": 0.0013846313813701272,
519
+ "learning_rate": 2.8306613226452906e-05,
520
+ "loss": 0.0006,
521
+ "mean_token_accuracy": 0.625,
522
+ "num_tokens": 1976984.0,
523
+ "step": 570
524
+ },
525
+ {
526
+ "epoch": 0.17450169236555096,
527
+ "grad_norm": 0.0010639706160873175,
528
+ "learning_rate": 2.880761523046092e-05,
529
+ "loss": 0.0003,
530
+ "mean_token_accuracy": 0.6375,
531
+ "num_tokens": 2011487.0,
532
+ "step": 580
533
+ },
534
+ {
535
+ "epoch": 0.17751034223392254,
536
+ "grad_norm": 0.002905157394707203,
537
+ "learning_rate": 2.9308617234468937e-05,
538
+ "loss": 0.0004,
539
+ "mean_token_accuracy": 0.5625,
540
+ "num_tokens": 2047066.0,
541
+ "step": 590
542
+ },
543
+ {
544
+ "epoch": 0.1805189921022941,
545
+ "grad_norm": 0.0001582392433192581,
546
+ "learning_rate": 2.9809619238476955e-05,
547
+ "loss": 0.0004,
548
+ "mean_token_accuracy": 0.6875,
549
+ "num_tokens": 2081518.0,
550
+ "step": 600
551
  }
552
  ],
553
  "logging_steps": 10,
 
567
  "attributes": {}
568
  }
569
  },
570
+ "total_flos": 4.660300459087872e+16,
571
  "train_batch_size": 1,
572
  "trial_name": null,
573
  "trial_params": null