Training in progress, step 800, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +711 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aa231a4fb18485169d08c9d1e7878f2c6c2747cf33272ebb7b91a615a73da69f
 size 373077376

 version https://git-lfs.github.com/spec/v1
+oid sha256:fd0e106749ec154eecd3ebb9fe7474cf3444291df427cce5d1d61cb2679e8088
 size 373077376

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ff95fd30c41364a06356f6550493cfc79f8b5f14e8279f05b156b0d50603cfb7
 size 422377675

 version https://git-lfs.github.com/spec/v1
+oid sha256:e6982c3f836b1c5917d64c3c2c07418fb1042a731f48fa53830cf50384b985a7
 size 422377675

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:36f1c8cafda7ec05bcf717e4cbc9d475e180378b36391598d72523001d0947ee
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:2933fa623da5d83a2ffe4eddaad982ac15c82f5c890445e228adf894e89f9290
 size 14645

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b636e22decc0690abb4217d3b016f329ae73b4d12bae4602c74bba0c4d4ffdc1
 size 1401

 version https://git-lfs.github.com/spec/v1
+oid sha256:4958966c61d8eed22eb0bdc6e0a1efc61ae912a801cb91fc7888b1951205081b
 size 1401

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.726895119418484,
   "eval_steps": 100,
-  "global_step": 700,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -4964,6 +4964,714 @@
       "eval_samples_per_second": 9.72,
       "eval_steps_per_second": 1.215,
       "step": 700
     }
   ],
   "logging_steps": 1,
@@ -4983,7 +5691,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 8.91532735414272e+16,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.8307372793354102,
   "eval_steps": 100,
+  "global_step": 800,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 9.72,
       "eval_steps_per_second": 1.215,
       "step": 700
+    },
+    {
+      "epoch": 0.7279335410176532,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.00019075678737450686,
+      "loss": 6.1547,
+      "step": 701
+    },
+    {
+      "epoch": 0.7289719626168224,
+      "grad_norm": 1.7734375,
+      "learning_rate": 0.00018940815215922607,
+      "loss": 6.0599,
+      "step": 702
+    },
+    {
+      "epoch": 0.7300103842159917,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.00018806318636018665,
+      "loss": 6.2195,
+      "step": 703
+    },
+    {
+      "epoch": 0.731048805815161,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.00018672190586717908,
+      "loss": 6.4289,
+      "step": 704
+    },
+    {
+      "epoch": 0.7320872274143302,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00018538432652645437,
+      "loss": 6.2451,
+      "step": 705
+    },
+    {
+      "epoch": 0.7331256490134995,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00018405046414053728,
+      "loss": 6.2281,
+      "step": 706
+    },
+    {
+      "epoch": 0.7341640706126688,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00018272033446803949,
+      "loss": 6.2168,
+      "step": 707
+    },
+    {
+      "epoch": 0.735202492211838,
+      "grad_norm": 1.7890625,
+      "learning_rate": 0.00018139395322347334,
+      "loss": 6.1276,
+      "step": 708
+    },
+    {
+      "epoch": 0.7362409138110073,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00018007133607706615,
+      "loss": 6.0438,
+      "step": 709
+    },
+    {
+      "epoch": 0.7372793354101765,
+      "grad_norm": 1.875,
+      "learning_rate": 0.00017875249865457527,
+      "loss": 6.0624,
+      "step": 710
+    },
+    {
+      "epoch": 0.7383177570093458,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.00017743745653710336,
+      "loss": 6.4648,
+      "step": 711
+    },
+    {
+      "epoch": 0.7393561786085151,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.00017612622526091403,
+      "loss": 6.2431,
+      "step": 712
+    },
+    {
+      "epoch": 0.7403946002076843,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00017481882031724927,
+      "loss": 6.3375,
+      "step": 713
+    },
+    {
+      "epoch": 0.7414330218068536,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.0001735152571521451,
+      "loss": 6.3156,
+      "step": 714
+    },
+    {
+      "epoch": 0.7424714434060229,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00017221555116625,
+      "loss": 6.2417,
+      "step": 715
+    },
+    {
+      "epoch": 0.7435098650051921,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.0001709197177146425,
+      "loss": 6.3143,
+      "step": 716
+    },
+    {
+      "epoch": 0.7445482866043613,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0001696277721066502,
+      "loss": 6.1396,
+      "step": 717
+    },
+    {
+      "epoch": 0.7455867082035307,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00016833972960566868,
+      "loss": 6.3164,
+      "step": 718
+    },
+    {
+      "epoch": 0.7466251298026999,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.00016705560542898051,
+      "loss": 6.2559,
+      "step": 719
+    },
+    {
+      "epoch": 0.7476635514018691,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.00016577541474757713,
+      "loss": 6.4104,
+      "step": 720
+    },
+    {
+      "epoch": 0.7487019730010385,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.00016449917268597798,
+      "loss": 6.0631,
+      "step": 721
+    },
+    {
+      "epoch": 0.7497403946002077,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00016322689432205252,
+      "loss": 6.3299,
+      "step": 722
+    },
+    {
+      "epoch": 0.7507788161993769,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00016195859468684198,
+      "loss": 6.2053,
+      "step": 723
+    },
+    {
+      "epoch": 0.7518172377985463,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00016069428876438202,
+      "loss": 6.3051,
+      "step": 724
+    },
+    {
+      "epoch": 0.7528556593977155,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00015943399149152533,
+      "loss": 6.1548,
+      "step": 725
+    },
+    {
+      "epoch": 0.7538940809968847,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00015817771775776507,
+      "loss": 6.2009,
+      "step": 726
+    },
+    {
+      "epoch": 0.754932502596054,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00015692548240506,
+      "loss": 6.2028,
+      "step": 727
+    },
+    {
+      "epoch": 0.7559709241952233,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00015567730022765752,
+      "loss": 5.9373,
+      "step": 728
+    },
+    {
+      "epoch": 0.7570093457943925,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0001544331859719202,
+      "loss": 6.1761,
+      "step": 729
+    },
+    {
+      "epoch": 0.7580477673935618,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.000153193154336151,
+      "loss": 6.2549,
+      "step": 730
+    },
+    {
+      "epoch": 0.7590861889927311,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.00015195721997041933,
+      "loss": 6.1982,
+      "step": 731
+    },
+    {
+      "epoch": 0.7601246105919003,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00015072539747638887,
+      "loss": 6.2346,
+      "step": 732
+    },
+    {
+      "epoch": 0.7611630321910696,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.00014949770140714407,
+      "loss": 5.5064,
+      "step": 733
+    },
+    {
+      "epoch": 0.7622014537902388,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0001482741462670193,
+      "loss": 6.3146,
+      "step": 734
+    },
+    {
+      "epoch": 0.7632398753894081,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0001470547465114263,
+      "loss": 6.2277,
+      "step": 735
+    },
+    {
+      "epoch": 0.7642782969885774,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.00014583951654668415,
+      "loss": 6.3032,
+      "step": 736
+    },
+    {
+      "epoch": 0.7653167185877466,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.00014462847072984898,
+      "loss": 5.9154,
+      "step": 737
+    },
+    {
+      "epoch": 0.7663551401869159,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.0001434216233685441,
+      "loss": 6.0951,
+      "step": 738
+    },
+    {
+      "epoch": 0.7673935617860852,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.00014221898872079108,
+      "loss": 6.0921,
+      "step": 739
+    },
+    {
+      "epoch": 0.7684319833852544,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0001410205809948419,
+      "loss": 6.2295,
+      "step": 740
+    },
+    {
+      "epoch": 0.7694704049844237,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00013982641434900984,
+      "loss": 6.229,
+      "step": 741
+    },
+    {
+      "epoch": 0.770508826583593,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00013863650289150338,
+      "loss": 6.3173,
+      "step": 742
+    },
+    {
+      "epoch": 0.7715472481827622,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.00013745086068025857,
+      "loss": 6.3666,
+      "step": 743
+    },
+    {
+      "epoch": 0.7725856697819314,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.00013626950172277398,
+      "loss": 6.1824,
+      "step": 744
+    },
+    {
+      "epoch": 0.7736240913811008,
+      "grad_norm": 1.78125,
+      "learning_rate": 0.00013509243997594423,
+      "loss": 6.2045,
+      "step": 745
+    },
+    {
+      "epoch": 0.77466251298027,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00013391968934589572,
+      "loss": 6.295,
+      "step": 746
+    },
+    {
+      "epoch": 0.7757009345794392,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00013275126368782235,
+      "loss": 6.3082,
+      "step": 747
+    },
+    {
+      "epoch": 0.7767393561786086,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.00013158717680582127,
+      "loss": 6.2444,
+      "step": 748
+    },
+    {
+      "epoch": 0.7777777777777778,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00013042744245273037,
+      "loss": 6.1545,
+      "step": 749
+    },
+    {
+      "epoch": 0.778816199376947,
+      "grad_norm": 1.75,
+      "learning_rate": 0.0001292720743299654,
+      "loss": 6.0637,
+      "step": 750
+    },
+    {
+      "epoch": 0.7798546209761164,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00012812108608735846,
+      "loss": 6.0392,
+      "step": 751
+    },
+    {
+      "epoch": 0.7808930425752856,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0001269744913229965,
+      "loss": 6.285,
+      "step": 752
+    },
+    {
+      "epoch": 0.7819314641744548,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.00012583230358306053,
+      "loss": 6.3178,
+      "step": 753
+    },
+    {
+      "epoch": 0.7829698857736241,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.00012469453636166643,
+      "loss": 6.1123,
+      "step": 754
+    },
+    {
+      "epoch": 0.7840083073727934,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00012356120310070407,
+      "loss": 6.379,
+      "step": 755
+    },
+    {
+      "epoch": 0.7850467289719626,
+      "grad_norm": 1.625,
+      "learning_rate": 0.00012243231718967967,
+      "loss": 6.127,
+      "step": 756
+    },
+    {
+      "epoch": 0.7860851505711319,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0001213078919655573,
+      "loss": 6.222,
+      "step": 757
+    },
+    {
+      "epoch": 0.7871235721703012,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00012018794071260119,
+      "loss": 6.0595,
+      "step": 758
+    },
+    {
+      "epoch": 0.7881619937694704,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.00011907247666221893,
+      "loss": 6.1771,
+      "step": 759
+    },
+    {
+      "epoch": 0.7892004153686397,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00011796151299280483,
+      "loss": 6.1493,
+      "step": 760
+    },
+    {
+      "epoch": 0.7902388369678089,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00011685506282958496,
+      "loss": 6.3724,
+      "step": 761
+    },
+    {
+      "epoch": 0.7912772585669782,
+      "grad_norm": 1.8203125,
+      "learning_rate": 0.00011575313924446123,
+      "loss": 6.1028,
+      "step": 762
+    },
+    {
+      "epoch": 0.7923156801661475,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.00011465575525585741,
+      "loss": 6.1988,
+      "step": 763
+    },
+    {
+      "epoch": 0.7933541017653167,
+      "grad_norm": 3.03125,
+      "learning_rate": 0.00011356292382856532,
+      "loss": 6.1213,
+      "step": 764
+    },
+    {
+      "epoch": 0.794392523364486,
+      "grad_norm": 1.8515625,
+      "learning_rate": 0.0001124746578735914,
+      "loss": 6.3859,
+      "step": 765
+    },
+    {
+      "epoch": 0.7954309449636553,
+      "grad_norm": 1.8984375,
+      "learning_rate": 0.0001113909702480046,
+      "loss": 6.0653,
+      "step": 766
+    },
+    {
+      "epoch": 0.7964693665628245,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00011031187375478407,
+      "loss": 6.2933,
+      "step": 767
+    },
+    {
+      "epoch": 0.7975077881619937,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00010923738114266823,
+      "loss": 6.0991,
+      "step": 768
+    },
+    {
+      "epoch": 0.7985462097611631,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.00010816750510600387,
+      "loss": 6.2484,
+      "step": 769
+    },
+    {
+      "epoch": 0.7995846313603323,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.00010710225828459641,
+      "loss": 5.7827,
+      "step": 770
+    },
+    {
+      "epoch": 0.8006230529595015,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0001060416532635603,
+      "loss": 6.3373,
+      "step": 771
+    },
+    {
+      "epoch": 0.8016614745586709,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.00010498570257317076,
+      "loss": 6.3325,
+      "step": 772
+    },
+    {
+      "epoch": 0.8026998961578401,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.00010393441868871506,
+      "loss": 6.1373,
+      "step": 773
+    },
+    {
+      "epoch": 0.8037383177570093,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.00010288781403034619,
+      "loss": 6.06,
+      "step": 774
+    },
+    {
+      "epoch": 0.8047767393561787,
+      "grad_norm": 1.71875,
+      "learning_rate": 0.00010184590096293506,
+      "loss": 6.0622,
+      "step": 775
+    },
+    {
+      "epoch": 0.8058151609553479,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.0001008086917959249,
+      "loss": 6.2272,
+      "step": 776
+    },
+    {
+      "epoch": 0.8068535825545171,
+      "grad_norm": 1.3828125,
+      "learning_rate": 9.977619878318578e-05,
+      "loss": 6.2692,
+      "step": 777
+    },
+    {
+      "epoch": 0.8078920041536864,
+      "grad_norm": 1.2265625,
+      "learning_rate": 9.874843412286993e-05,
+      "loss": 6.3817,
+      "step": 778
+    },
+    {
+      "epoch": 0.8089304257528557,
+      "grad_norm": 1.9921875,
+      "learning_rate": 9.772540995726753e-05,
+      "loss": 6.4033,
+      "step": 779
+    },
+    {
+      "epoch": 0.8099688473520249,
+      "grad_norm": 1.625,
+      "learning_rate": 9.67071383726632e-05,
+      "loss": 6.0418,
+      "step": 780
+    },
+    {
+      "epoch": 0.8110072689511942,
+      "grad_norm": 1.3125,
+      "learning_rate": 9.569363139919341e-05,
+      "loss": 6.2407,
+      "step": 781
+    },
+    {
+      "epoch": 0.8120456905503635,
+      "grad_norm": 1.6796875,
+      "learning_rate": 9.468490101070409e-05,
+      "loss": 6.1643,
+      "step": 782
+    },
+    {
+      "epoch": 0.8130841121495327,
+      "grad_norm": 1.1796875,
+      "learning_rate": 9.368095912460934e-05,
+      "loss": 6.1331,
+      "step": 783
+    },
+    {
+      "epoch": 0.814122533748702,
+      "grad_norm": 1.2421875,
+      "learning_rate": 9.26818176017506e-05,
+      "loss": 6.363,
+      "step": 784
+    },
+    {
+      "epoch": 0.8151609553478713,
+      "grad_norm": 1.4609375,
+      "learning_rate": 9.168748824625655e-05,
+      "loss": 6.2178,
+      "step": 785
+    },
+    {
+      "epoch": 0.8161993769470405,
+      "grad_norm": 1.6328125,
+      "learning_rate": 9.069798280540348e-05,
+      "loss": 6.146,
+      "step": 786
+    },
+    {
+      "epoch": 0.8172377985462098,
+      "grad_norm": 1.4453125,
+      "learning_rate": 8.9713312969477e-05,
+      "loss": 6.1887,
+      "step": 787
+    },
+    {
+      "epoch": 0.818276220145379,
+      "grad_norm": 1.2421875,
+      "learning_rate": 8.87334903716332e-05,
+      "loss": 6.2521,
+      "step": 788
+    },
+    {
+      "epoch": 0.8193146417445483,
+      "grad_norm": 1.265625,
+      "learning_rate": 8.775852658776173e-05,
+      "loss": 6.3487,
+      "step": 789
+    },
+    {
+      "epoch": 0.8203530633437176,
+      "grad_norm": 1.5234375,
+      "learning_rate": 8.678843313634893e-05,
+      "loss": 6.2509,
+      "step": 790
+    },
+    {
+      "epoch": 0.8213914849428868,
+      "grad_norm": 1.71875,
+      "learning_rate": 8.58232214783416e-05,
+      "loss": 6.0586,
+      "step": 791
+    },
+    {
+      "epoch": 0.822429906542056,
+      "grad_norm": 1.4921875,
+      "learning_rate": 8.486290301701182e-05,
+      "loss": 6.293,
+      "step": 792
+    },
+    {
+      "epoch": 0.8234683281412254,
+      "grad_norm": 1.5,
+      "learning_rate": 8.390748909782204e-05,
+      "loss": 6.2504,
+      "step": 793
+    },
+    {
+      "epoch": 0.8245067497403946,
+      "grad_norm": 1.28125,
+      "learning_rate": 8.295699100829124e-05,
+      "loss": 6.2907,
+      "step": 794
+    },
+    {
+      "epoch": 0.8255451713395638,
+      "grad_norm": 1.125,
+      "learning_rate": 8.201141997786127e-05,
+      "loss": 6.2033,
+      "step": 795
+    },
+    {
+      "epoch": 0.8265835929387332,
+      "grad_norm": 1.2109375,
+      "learning_rate": 8.107078717776456e-05,
+      "loss": 6.3058,
+      "step": 796
+    },
+    {
+      "epoch": 0.8276220145379024,
+      "grad_norm": 1.4296875,
+      "learning_rate": 8.013510372089184e-05,
+      "loss": 6.1276,
+      "step": 797
+    },
+    {
+      "epoch": 0.8286604361370716,
+      "grad_norm": 1.4140625,
+      "learning_rate": 7.920438066166097e-05,
+      "loss": 6.4023,
+      "step": 798
+    },
+    {
+      "epoch": 0.829698857736241,
+      "grad_norm": 1.953125,
+      "learning_rate": 7.827862899588634e-05,
+      "loss": 6.1487,
+      "step": 799
+    },
+    {
+      "epoch": 0.8307372793354102,
+      "grad_norm": 1.3671875,
+      "learning_rate": 7.735785966064884e-05,
+      "loss": 5.9001,
+      "step": 800
+    },
+    {
+      "epoch": 0.8307372793354102,
+      "eval_loss": 6.255230903625488,
+      "eval_runtime": 1.6449,
+      "eval_samples_per_second": 9.727,
+      "eval_steps_per_second": 1.216,
+      "step": 800
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 1.018894554759168e+17,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null