ljcamargo commited on
Commit
33957c2
·
verified ·
1 Parent(s): 465465b

Training in progress, step 900, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8914facca3ec7ebd4ca0af63a4103bd73934c6203de2086fcb50395772ac962
3
  size 3237818848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea44be5f29e63d43296d9d83bd74000d9eec25472608a721883a3def330d0d51
3
  size 3237818848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0950e188b2932851adfd96a6948dca2e97b8e9815befb943767ad300ac5bddf4
3
  size 2062251569
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcfc52b46b2bcbd19bdeae44612f8466c1fd2dddd02666025d9a6d924a564419
3
  size 2062251569
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7087649df6c0734a2a4d59d344e34355cbcef9bd4b101d7b7a1da6a37d115851
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60c8632974dc900245d4dfbbcf87a13b532e38345500a34dea8a1b480b697112
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e35963fbe17703d43e57c264c8bf401c049828d6ea5abe6c269f936eebec007
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:504b7bc543b9e5f039f6559d07b099507a66c15c86836ff5981e4eee51792c02
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:acea6b741bab97301e556cecda1616269a490b6124f19e7710f2f8643bc308f4
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a838d3ba3633bb04603e3afbc02ea3103b4064d4c633a0639c7ced656d5b0c92
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.26666666666666666,
6
  "eval_steps": 300,
7
- "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -440,6 +440,216 @@
440
  "learning_rate": 0.00017066031020892934,
441
  "loss": 1.1963,
442
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
  }
444
  ],
445
  "logging_steps": 10,
@@ -459,7 +669,7 @@
459
  "attributes": {}
460
  }
461
  },
462
- "total_flos": 1.6367531065344e+19,
463
  "train_batch_size": 4,
464
  "trial_name": null,
465
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4,
6
  "eval_steps": 300,
7
+ "global_step": 900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
440
  "learning_rate": 0.00017066031020892934,
441
  "loss": 1.1963,
442
  "step": 600
443
+ },
444
+ {
445
+ "epoch": 0.27111111111111114,
446
+ "grad_norm": 22.651229858398438,
447
+ "learning_rate": 0.00016964546254930247,
448
+ "loss": 1.1826,
449
+ "step": 610
450
+ },
451
+ {
452
+ "epoch": 0.27555555555555555,
453
+ "grad_norm": 28.09796905517578,
454
+ "learning_rate": 0.0001686164903528152,
455
+ "loss": 1.2849,
456
+ "step": 620
457
+ },
458
+ {
459
+ "epoch": 0.28,
460
+ "grad_norm": 22.29288673400879,
461
+ "learning_rate": 0.00016757360230148618,
462
+ "loss": 1.2473,
463
+ "step": 630
464
+ },
465
+ {
466
+ "epoch": 0.28444444444444444,
467
+ "grad_norm": 29.369836807250977,
468
+ "learning_rate": 0.00016651700989955682,
469
+ "loss": 1.2133,
470
+ "step": 640
471
+ },
472
+ {
473
+ "epoch": 0.28888888888888886,
474
+ "grad_norm": 10.829903602600098,
475
+ "learning_rate": 0.00016544692743059684,
476
+ "loss": 1.2379,
477
+ "step": 650
478
+ },
479
+ {
480
+ "epoch": 0.29333333333333333,
481
+ "grad_norm": 10.857136726379395,
482
+ "learning_rate": 0.0001643635719140461,
483
+ "loss": 1.1889,
484
+ "step": 660
485
+ },
486
+ {
487
+ "epoch": 0.29777777777777775,
488
+ "grad_norm": 10.762494087219238,
489
+ "learning_rate": 0.00016326716306120195,
490
+ "loss": 1.2591,
491
+ "step": 670
492
+ },
493
+ {
494
+ "epoch": 0.3022222222222222,
495
+ "grad_norm": 13.926369667053223,
496
+ "learning_rate": 0.00016215792323066012,
497
+ "loss": 1.2075,
498
+ "step": 680
499
+ },
500
+ {
501
+ "epoch": 0.30666666666666664,
502
+ "grad_norm": 10.43800163269043,
503
+ "learning_rate": 0.00016103607738321925,
504
+ "loss": 1.2076,
505
+ "step": 690
506
+ },
507
+ {
508
+ "epoch": 0.3111111111111111,
509
+ "grad_norm": 18.675508499145508,
510
+ "learning_rate": 0.0001599018530362573,
511
+ "loss": 1.2339,
512
+ "step": 700
513
+ },
514
+ {
515
+ "epoch": 0.31555555555555553,
516
+ "grad_norm": 10.020101547241211,
517
+ "learning_rate": 0.0001587554802175895,
518
+ "loss": 1.14,
519
+ "step": 710
520
+ },
521
+ {
522
+ "epoch": 0.32,
523
+ "grad_norm": 8.47937297821045,
524
+ "learning_rate": 0.0001575971914188175,
525
+ "loss": 1.145,
526
+ "step": 720
527
+ },
528
+ {
529
+ "epoch": 0.3244444444444444,
530
+ "grad_norm": 16.2773380279541,
531
+ "learning_rate": 0.00015642722154817848,
532
+ "loss": 1.1076,
533
+ "step": 730
534
+ },
535
+ {
536
+ "epoch": 0.3288888888888889,
537
+ "grad_norm": 10.47890853881836,
538
+ "learning_rate": 0.00015524580788290425,
539
+ "loss": 1.1414,
540
+ "step": 740
541
+ },
542
+ {
543
+ "epoch": 0.3333333333333333,
544
+ "grad_norm": 9.702156066894531,
545
+ "learning_rate": 0.0001540531900211,
546
+ "loss": 1.22,
547
+ "step": 750
548
+ },
549
+ {
550
+ "epoch": 0.3377777777777778,
551
+ "grad_norm": 10.579848289489746,
552
+ "learning_rate": 0.0001528496098331523,
553
+ "loss": 1.1548,
554
+ "step": 760
555
+ },
556
+ {
557
+ "epoch": 0.3422222222222222,
558
+ "grad_norm": 24.226659774780273,
559
+ "learning_rate": 0.00015163531141267628,
560
+ "loss": 1.1407,
561
+ "step": 770
562
+ },
563
+ {
564
+ "epoch": 0.3466666666666667,
565
+ "grad_norm": 11.10332202911377,
566
+ "learning_rate": 0.00015041054102701184,
567
+ "loss": 1.1642,
568
+ "step": 780
569
+ },
570
+ {
571
+ "epoch": 0.3511111111111111,
572
+ "grad_norm": 14.13973331451416,
573
+ "learning_rate": 0.00014917554706727915,
574
+ "loss": 1.1726,
575
+ "step": 790
576
+ },
577
+ {
578
+ "epoch": 0.35555555555555557,
579
+ "grad_norm": 21.75472640991211,
580
+ "learning_rate": 0.00014793057999800335,
581
+ "loss": 1.1478,
582
+ "step": 800
583
+ },
584
+ {
585
+ "epoch": 0.36,
586
+ "grad_norm": 12.123833656311035,
587
+ "learning_rate": 0.0001466758923063189,
588
+ "loss": 1.1939,
589
+ "step": 810
590
+ },
591
+ {
592
+ "epoch": 0.36444444444444446,
593
+ "grad_norm": 9.510560035705566,
594
+ "learning_rate": 0.00014541173845076323,
595
+ "loss": 1.0843,
596
+ "step": 820
597
+ },
598
+ {
599
+ "epoch": 0.3688888888888889,
600
+ "grad_norm": 17.031314849853516,
601
+ "learning_rate": 0.00014413837480967145,
602
+ "loss": 1.1181,
603
+ "step": 830
604
+ },
605
+ {
606
+ "epoch": 0.37333333333333335,
607
+ "grad_norm": 16.022037506103516,
608
+ "learning_rate": 0.00014285605962918084,
609
+ "loss": 1.1542,
610
+ "step": 840
611
+ },
612
+ {
613
+ "epoch": 0.37777777777777777,
614
+ "grad_norm": 12.77236270904541,
615
+ "learning_rate": 0.00014156505297085713,
616
+ "loss": 1.114,
617
+ "step": 850
618
+ },
619
+ {
620
+ "epoch": 0.38222222222222224,
621
+ "grad_norm": 38.819454193115234,
622
+ "learning_rate": 0.00014026561665895224,
623
+ "loss": 1.0932,
624
+ "step": 860
625
+ },
626
+ {
627
+ "epoch": 0.38666666666666666,
628
+ "grad_norm": 10.85486125946045,
629
+ "learning_rate": 0.00013895801422730473,
630
+ "loss": 1.11,
631
+ "step": 870
632
+ },
633
+ {
634
+ "epoch": 0.39111111111111113,
635
+ "grad_norm": 10.986682891845703,
636
+ "learning_rate": 0.00013764251086589353,
637
+ "loss": 1.0752,
638
+ "step": 880
639
+ },
640
+ {
641
+ "epoch": 0.39555555555555555,
642
+ "grad_norm": 7.950289726257324,
643
+ "learning_rate": 0.00013631937336705568,
644
+ "loss": 1.1817,
645
+ "step": 890
646
+ },
647
+ {
648
+ "epoch": 0.4,
649
+ "grad_norm": 7.103327751159668,
650
+ "learning_rate": 0.00013498887007137918,
651
+ "loss": 1.0813,
652
+ "step": 900
653
  }
654
  ],
655
  "logging_steps": 10,
 
669
  "attributes": {}
670
  }
671
  },
672
+ "total_flos": 2.4551296598016e+19,
673
  "train_batch_size": 4,
674
  "trial_name": null,
675
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90a266fde3aed1b9627604c89937e8d2ff74c90016b7e05cb2a1d6ffdc03917d
3
  size 5969
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9012ea34a655f218dd2bab2f2edc38d537cc1525f0e422dbcceaa6fa5a75517
3
  size 5969