ljcamargo commited on
Commit
6242e45
·
verified ·
1 Parent(s): 27e6aa4

Training in progress, step 900, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70bb5bb361cb2c44a3c95065d77a422ed9649be5bd191a41e78d34e35834847b
3
  size 3237829088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12451cbc3f681c4d652adfbac061b94a617fe5288eadb9338e0c0116f1ab09a0
3
  size 3237829088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:098af36ab395a778e4ad67ddfa0191cced4901cc9518f1a83364c1e0ed5e1dbe
3
  size 2062272049
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2ac56c591f7d766084f7d7411df625b2b9385404ec3f7c675d4dea7dc7e6caf
3
  size 2062272049
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e4743b658fa9de4e54f29c9ad2e962e40b252677e30a13b5a792846f596864f
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dbfcd19dd04e87ea5a0a02be9024658d9d950751b047080030098d848f7be93
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e35963fbe17703d43e57c264c8bf401c049828d6ea5abe6c269f936eebec007
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:504b7bc543b9e5f039f6559d07b099507a66c15c86836ff5981e4eee51792c02
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0bb282d448dcf74bd6fbccf99dc933faaae6d52cfc91f1d2df7df3c6a133ab1a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb4a875674808f59301dfb232d0f66520f4ad75d8962141e97389a5c2ef15def
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.17219113215669393,
6
  "eval_steps": 300,
7
- "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -428,6 +428,216 @@
428
  "learning_rate": 0.00018848305761935797,
429
  "loss": 0.9528,
430
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
  }
432
  ],
433
  "logging_steps": 10,
@@ -447,7 +657,7 @@
447
  "attributes": {}
448
  }
449
  },
450
- "total_flos": 2.4551296598016e+19,
451
  "train_batch_size": 6,
452
  "trial_name": null,
453
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2582866982350409,
6
  "eval_steps": 300,
7
+ "global_step": 900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
428
  "learning_rate": 0.00018848305761935797,
429
  "loss": 0.9528,
430
  "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.1750609843593055,
434
+ "grad_norm": 9.415815353393555,
435
+ "learning_rate": 0.00018805083564044802,
436
+ "loss": 0.8619,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.17793083656191705,
441
+ "grad_norm": 9.250490188598633,
442
+ "learning_rate": 0.0001876111664371025,
443
+ "loss": 0.9168,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.18080068876452862,
448
+ "grad_norm": 15.730814933776855,
449
+ "learning_rate": 0.0001871640871959672,
450
+ "loss": 0.94,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.1836705409671402,
455
+ "grad_norm": 9.073026657104492,
456
+ "learning_rate": 0.0001867096357304191,
457
+ "loss": 0.9471,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.18654039316975177,
462
+ "grad_norm": 8.982126235961914,
463
+ "learning_rate": 0.00018624785047736842,
464
+ "loss": 0.9177,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.1894102453723633,
469
+ "grad_norm": 10.682122230529785,
470
+ "learning_rate": 0.00018577877049400746,
471
+ "loss": 0.9402,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.19228009757497488,
476
+ "grad_norm": 8.706944465637207,
477
+ "learning_rate": 0.0001853024354545073,
478
+ "loss": 0.8867,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.19514994977758646,
483
+ "grad_norm": 5.8472371101379395,
484
+ "learning_rate": 0.00018481888564666208,
485
+ "loss": 0.9135,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.19801980198019803,
490
+ "grad_norm": 5.432713508605957,
491
+ "learning_rate": 0.00018432816196848172,
492
+ "loss": 0.8525,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.20088965418280957,
497
+ "grad_norm": 28.993038177490234,
498
+ "learning_rate": 0.00018383030592473266,
499
+ "loss": 0.8779,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.20375950638542115,
504
+ "grad_norm": 5.313049793243408,
505
+ "learning_rate": 0.0001833253596234274,
506
+ "loss": 0.9551,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.20662935858803272,
511
+ "grad_norm": 18.639175415039062,
512
+ "learning_rate": 0.00018281336577226327,
513
+ "loss": 0.8694,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.2094992107906443,
518
+ "grad_norm": 15.578129768371582,
519
+ "learning_rate": 0.00018229436767501012,
520
+ "loss": 0.9017,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.21236906299325584,
525
+ "grad_norm": 18.0419864654541,
526
+ "learning_rate": 0.0001817684092278477,
527
+ "loss": 0.8616,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.2152389151958674,
532
+ "grad_norm": 8.34323787689209,
533
+ "learning_rate": 0.00018123553491565308,
534
+ "loss": 0.8902,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.21810876739847898,
539
+ "grad_norm": 8.49802017211914,
540
+ "learning_rate": 0.00018069578980823816,
541
+ "loss": 0.8781,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.22097861960109055,
546
+ "grad_norm": 6.250750541687012,
547
+ "learning_rate": 0.00018014921955653772,
548
+ "loss": 0.8405,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.2238484718037021,
553
+ "grad_norm": 25.283082962036133,
554
+ "learning_rate": 0.00017959587038874822,
555
+ "loss": 0.93,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.22671832400631367,
560
+ "grad_norm": 18.443071365356445,
561
+ "learning_rate": 0.00017903578910641814,
562
+ "loss": 0.9202,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.22958817620892524,
567
+ "grad_norm": 18.457555770874023,
568
+ "learning_rate": 0.0001784690230804892,
569
+ "loss": 0.9446,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.23245802841153682,
574
+ "grad_norm": 7.786270618438721,
575
+ "learning_rate": 0.00017789562024729012,
576
+ "loss": 0.899,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.23532788061414836,
581
+ "grad_norm": 6.527904033660889,
582
+ "learning_rate": 0.00017731562910448202,
583
+ "loss": 0.8866,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.23819773281675993,
588
+ "grad_norm": 8.394437789916992,
589
+ "learning_rate": 0.00017672909870695665,
590
+ "loss": 0.8749,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.2410675850193715,
595
+ "grad_norm": 6.815917491912842,
596
+ "learning_rate": 0.00017613607866268742,
597
+ "loss": 0.8542,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.24393743722198308,
602
+ "grad_norm": 16.42218780517578,
603
+ "learning_rate": 0.00017553661912853347,
604
+ "loss": 0.8658,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.24680728942459462,
609
+ "grad_norm": 14.373140335083008,
610
+ "learning_rate": 0.00017493077080599768,
611
+ "loss": 0.8756,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.2496771416272062,
616
+ "grad_norm": 17.368059158325195,
617
+ "learning_rate": 0.0001743185849369381,
618
+ "loss": 0.9572,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.25254699382981777,
623
+ "grad_norm": 8.744333267211914,
624
+ "learning_rate": 0.0001737001132992344,
625
+ "loss": 0.8743,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.2554168460324293,
630
+ "grad_norm": 9.240042686462402,
631
+ "learning_rate": 0.0001730754082024082,
632
+ "loss": 0.8666,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.2582866982350409,
637
+ "grad_norm": 8.81686782836914,
638
+ "learning_rate": 0.00017244452248319896,
639
+ "loss": 0.8771,
640
+ "step": 900
641
  }
642
  ],
643
  "logging_steps": 10,
 
657
  "attributes": {}
658
  }
659
  },
660
+ "total_flos": 3.6826944897024e+19,
661
  "train_batch_size": 6,
662
  "trial_name": null,
663
  "trial_params": null