Training in progress, step 153000, checkpoint

Browse files

Files changed (4) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +703 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:119fa35511622a0adffa8a0b48170a5bccc46c234ae0a4e5516bc4d1cc3c3bae
 size 222485192

 version https://git-lfs.github.com/spec/v1
+oid sha256:136c76df1e01333390cbfdb110ceb8645de402aa2f9bd1d1073c2092f0435997
 size 222485192

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d08c1930b12ba4aa7e60405395893774f73354f74652895d6f3ce215c968d44a
 size 445094091

 version https://git-lfs.github.com/spec/v1
+oid sha256:06001f506fada90ae093c72a8c5895e578d9ad3a914dc5e4d6c327451c6c0591
 size 445094091

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3a479f20f27be8e95696c065329f168ed4016ec0491fc70f379e4badf157ea18
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:703fc4033d7149e1bb71c646735942adff51d66cbb62134a1d420c80d1aea545
 size 1465

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.1225854018186288,
   "eval_steps": 500,
-  "global_step": 152000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -106408,6 +106408,706 @@
       "learning_rate": 0.00015721189632451183,
       "loss": 0.4339,
       "step": 152000
     }
   ],
   "logging_steps": 10,
@@ -106427,7 +107127,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.508395182600015e+19,
   "train_batch_size": 2048,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 1.1266814123044155,
   "eval_steps": 500,
+  "global_step": 153000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 0.00015721189632451183,
       "loss": 0.4339,
       "step": 152000
+    },
+    {
+      "epoch": 1.1226263619234866,
+      "grad_norm": 0.21575403213500977,
+      "learning_rate": 0.00015718190208754518,
+      "loss": 0.4372,
+      "step": 152010
+    },
+    {
+      "epoch": 1.1226673220283443,
+      "grad_norm": 0.21100935339927673,
+      "learning_rate": 0.00015715190940017844,
+      "loss": 0.4255,
+      "step": 152020
+    },
+    {
+      "epoch": 1.1227082821332022,
+      "grad_norm": 0.1999680995941162,
+      "learning_rate": 0.00015712191826291233,
+      "loss": 0.4168,
+      "step": 152030
+    },
+    {
+      "epoch": 1.12274924223806,
+      "grad_norm": 0.2585766315460205,
+      "learning_rate": 0.00015709192867624755,
+      "loss": 0.41,
+      "step": 152040
+    },
+    {
+      "epoch": 1.122790202342918,
+      "grad_norm": 0.2113899439573288,
+      "learning_rate": 0.00015706194064068487,
+      "loss": 0.4045,
+      "step": 152050
+    },
+    {
+      "epoch": 1.1228311624477758,
+      "grad_norm": 0.20225553214550018,
+      "learning_rate": 0.00015703195415672484,
+      "loss": 0.3972,
+      "step": 152060
+    },
+    {
+      "epoch": 1.1228721225526337,
+      "grad_norm": 0.19689704477787018,
+      "learning_rate": 0.00015700196922486807,
+      "loss": 0.385,
+      "step": 152070
+    },
+    {
+      "epoch": 1.1229130826574916,
+      "grad_norm": 0.21756672859191895,
+      "learning_rate": 0.00015697198584561524,
+      "loss": 0.3743,
+      "step": 152080
+    },
+    {
+      "epoch": 1.1229540427623494,
+      "grad_norm": 0.2084749937057495,
+      "learning_rate": 0.0001569420040194668,
+      "loss": 0.3669,
+      "step": 152090
+    },
+    {
+      "epoch": 1.1229950028672073,
+      "grad_norm": 0.20592711865901947,
+      "learning_rate": 0.00015691202374692347,
+      "loss": 0.3539,
+      "step": 152100
+    },
+    {
+      "epoch": 1.1230359629720652,
+      "grad_norm": 0.19951869547367096,
+      "learning_rate": 0.00015688204502848564,
+      "loss": 0.3473,
+      "step": 152110
+    },
+    {
+      "epoch": 1.123076923076923,
+      "grad_norm": 0.21034152805805206,
+      "learning_rate": 0.00015685206786465383,
+      "loss": 0.3277,
+      "step": 152120
+    },
+    {
+      "epoch": 1.123117883181781,
+      "grad_norm": 0.20541854202747345,
+      "learning_rate": 0.00015682209225592863,
+      "loss": 0.321,
+      "step": 152130
+    },
+    {
+      "epoch": 1.1231588432866388,
+      "grad_norm": 0.212013378739357,
+      "learning_rate": 0.00015679211820281028,
+      "loss": 0.3059,
+      "step": 152140
+    },
+    {
+      "epoch": 1.1231998033914967,
+      "grad_norm": 0.19732996821403503,
+      "learning_rate": 0.00015676214570579933,
+      "loss": 0.2932,
+      "step": 152150
+    },
+    {
+      "epoch": 1.1232407634963546,
+      "grad_norm": 0.19249044358730316,
+      "learning_rate": 0.0001567321747653961,
+      "loss": 0.2744,
+      "step": 152160
+    },
+    {
+      "epoch": 1.1232817236012125,
+      "grad_norm": 0.18894243240356445,
+      "learning_rate": 0.00015670220538210102,
+      "loss": 0.2646,
+      "step": 152170
+    },
+    {
+      "epoch": 1.1233226837060704,
+      "grad_norm": 0.20343731343746185,
+      "learning_rate": 0.00015667223755641435,
+      "loss": 0.2584,
+      "step": 152180
+    },
+    {
+      "epoch": 1.1233636438109282,
+      "grad_norm": 0.20398001372814178,
+      "learning_rate": 0.00015664227128883655,
+      "loss": 0.2443,
+      "step": 152190
+    },
+    {
+      "epoch": 1.1234046039157861,
+      "grad_norm": 0.1968417465686798,
+      "learning_rate": 0.00015661230657986784,
+      "loss": 0.2242,
+      "step": 152200
+    },
+    {
+      "epoch": 1.123445564020644,
+      "grad_norm": 0.17249369621276855,
+      "learning_rate": 0.00015658234343000836,
+      "loss": 0.2158,
+      "step": 152210
+    },
+    {
+      "epoch": 1.1234865241255019,
+      "grad_norm": 0.1990642249584198,
+      "learning_rate": 0.00015655238183975845,
+      "loss": 0.2143,
+      "step": 152220
+    },
+    {
+      "epoch": 1.1235274842303595,
+      "grad_norm": 0.19265955686569214,
+      "learning_rate": 0.0001565224218096183,
+      "loss": 0.2199,
+      "step": 152230
+    },
+    {
+      "epoch": 1.1235684443352174,
+      "grad_norm": 0.18196046352386475,
+      "learning_rate": 0.00015649246334008816,
+      "loss": 0.272,
+      "step": 152240
+    },
+    {
+      "epoch": 1.1236094044400753,
+      "grad_norm": 0.21095934510231018,
+      "learning_rate": 0.00015646250643166804,
+      "loss": 0.3358,
+      "step": 152250
+    },
+    {
+      "epoch": 1.1236503645449332,
+      "grad_norm": 0.21196311712265015,
+      "learning_rate": 0.00015643255108485822,
+      "loss": 0.3869,
+      "step": 152260
+    },
+    {
+      "epoch": 1.123691324649791,
+      "grad_norm": 0.2193775326013565,
+      "learning_rate": 0.00015640259730015883,
+      "loss": 0.4176,
+      "step": 152270
+    },
+    {
+      "epoch": 1.123732284754649,
+      "grad_norm": 0.23009201884269714,
+      "learning_rate": 0.00015637264507806976,
+      "loss": 0.4597,
+      "step": 152280
+    },
+    {
+      "epoch": 1.1237732448595068,
+      "grad_norm": 0.22631536424160004,
+      "learning_rate": 0.0001563426944190912,
+      "loss": 0.4829,
+      "step": 152290
+    },
+    {
+      "epoch": 1.1238142049643647,
+      "grad_norm": 0.22960945963859558,
+      "learning_rate": 0.00015631274532372307,
+      "loss": 0.4984,
+      "step": 152300
+    },
+    {
+      "epoch": 1.1238551650692226,
+      "grad_norm": 0.22582904994487762,
+      "learning_rate": 0.00015628279779246552,
+      "loss": 0.5136,
+      "step": 152310
+    },
+    {
+      "epoch": 1.1238961251740804,
+      "grad_norm": 0.25222963094711304,
+      "learning_rate": 0.00015625285182581844,
+      "loss": 0.5348,
+      "step": 152320
+    },
+    {
+      "epoch": 1.1239370852789383,
+      "grad_norm": 0.24137544631958008,
+      "learning_rate": 0.00015622290742428173,
+      "loss": 0.5435,
+      "step": 152330
+    },
+    {
+      "epoch": 1.1239780453837962,
+      "grad_norm": 0.2230043262243271,
+      "learning_rate": 0.00015619296458835557,
+      "loss": 0.5384,
+      "step": 152340
+    },
+    {
+      "epoch": 1.124019005488654,
+      "grad_norm": 0.23206926882266998,
+      "learning_rate": 0.0001561630233185395,
+      "loss": 0.5469,
+      "step": 152350
+    },
+    {
+      "epoch": 1.124059965593512,
+      "grad_norm": 0.2324897199869156,
+      "learning_rate": 0.00015613308361533352,
+      "loss": 0.5541,
+      "step": 152360
+    },
+    {
+      "epoch": 1.1241009256983698,
+      "grad_norm": 0.22080039978027344,
+      "learning_rate": 0.00015610314547923754,
+      "loss": 0.5526,
+      "step": 152370
+    },
+    {
+      "epoch": 1.1241418858032277,
+      "grad_norm": 0.2413584589958191,
+      "learning_rate": 0.0001560732089107513,
+      "loss": 0.5607,
+      "step": 152380
+    },
+    {
+      "epoch": 1.1241828459080856,
+      "grad_norm": 0.22935831546783447,
+      "learning_rate": 0.0001560432739103747,
+      "loss": 0.5558,
+      "step": 152390
+    },
+    {
+      "epoch": 1.1242238060129435,
+      "grad_norm": 0.24277865886688232,
+      "learning_rate": 0.00015601334047860737,
+      "loss": 0.5689,
+      "step": 152400
+    },
+    {
+      "epoch": 1.1242647661178014,
+      "grad_norm": 0.23572035133838654,
+      "learning_rate": 0.00015598340861594924,
+      "loss": 0.5617,
+      "step": 152410
+    },
+    {
+      "epoch": 1.1243057262226592,
+      "grad_norm": 0.25575461983680725,
+      "learning_rate": 0.0001559534783228998,
+      "loss": 0.5631,
+      "step": 152420
+    },
+    {
+      "epoch": 1.124346686327517,
+      "grad_norm": 0.22678613662719727,
+      "learning_rate": 0.00015592354959995884,
+      "loss": 0.5644,
+      "step": 152430
+    },
+    {
+      "epoch": 1.1243876464323748,
+      "grad_norm": 0.22008901834487915,
+      "learning_rate": 0.00015589362244762601,
+      "loss": 0.5612,
+      "step": 152440
+    },
+    {
+      "epoch": 1.1244286065372326,
+      "grad_norm": 0.23076428472995758,
+      "learning_rate": 0.00015586369686640094,
+      "loss": 0.5678,
+      "step": 152450
+    },
+    {
+      "epoch": 1.1244695666420905,
+      "grad_norm": 0.2724132835865021,
+      "learning_rate": 0.0001558337728567833,
+      "loss": 0.5601,
+      "step": 152460
+    },
+    {
+      "epoch": 1.1245105267469484,
+      "grad_norm": 0.23742075264453888,
+      "learning_rate": 0.00015580385041927253,
+      "loss": 0.5624,
+      "step": 152470
+    },
+    {
+      "epoch": 1.1245514868518063,
+      "grad_norm": 0.24539630115032196,
+      "learning_rate": 0.00015577392955436843,
+      "loss": 0.5599,
+      "step": 152480
+    },
+    {
+      "epoch": 1.1245924469566642,
+      "grad_norm": 0.24014654755592346,
+      "learning_rate": 0.00015574401026257029,
+      "loss": 0.562,
+      "step": 152490
+    },
+    {
+      "epoch": 1.124633407061522,
+      "grad_norm": 0.22930192947387695,
+      "learning_rate": 0.00015571409254437765,
+      "loss": 0.5637,
+      "step": 152500
+    },
+    {
+      "epoch": 1.12467436716638,
+      "grad_norm": 0.2182847410440445,
+      "learning_rate": 0.00015568417640029008,
+      "loss": 0.557,
+      "step": 152510
+    },
+    {
+      "epoch": 1.1247153272712378,
+      "grad_norm": 0.23275041580200195,
+      "learning_rate": 0.00015565426183080698,
+      "loss": 0.5562,
+      "step": 152520
+    },
+    {
+      "epoch": 1.1247562873760957,
+      "grad_norm": 0.24153053760528564,
+      "learning_rate": 0.00015562434883642777,
+      "loss": 0.5614,
+      "step": 152530
+    },
+    {
+      "epoch": 1.1247972474809536,
+      "grad_norm": 0.23341676592826843,
+      "learning_rate": 0.00015559443741765182,
+      "loss": 0.5611,
+      "step": 152540
+    },
+    {
+      "epoch": 1.1248382075858114,
+      "grad_norm": 0.2416403889656067,
+      "learning_rate": 0.00015556452757497868,
+      "loss": 0.5661,
+      "step": 152550
+    },
+    {
+      "epoch": 1.1248791676906693,
+      "grad_norm": 0.2251540720462799,
+      "learning_rate": 0.0001555346193089075,
+      "loss": 0.5587,
+      "step": 152560
+    },
+    {
+      "epoch": 1.1249201277955272,
+      "grad_norm": 0.23912055790424347,
+      "learning_rate": 0.00015550471261993755,
+      "loss": 0.5508,
+      "step": 152570
+    },
+    {
+      "epoch": 1.124961087900385,
+      "grad_norm": 0.24343754351139069,
+      "learning_rate": 0.00015547480750856835,
+      "loss": 0.5569,
+      "step": 152580
+    },
+    {
+      "epoch": 1.125002048005243,
+      "grad_norm": 0.2361503690481186,
+      "learning_rate": 0.0001554449039752989,
+      "loss": 0.5644,
+      "step": 152590
+    },
+    {
+      "epoch": 1.1250430081101008,
+      "grad_norm": 0.22848433256149292,
+      "learning_rate": 0.00015541500202062873,
+      "loss": 0.5584,
+      "step": 152600
+    },
+    {
+      "epoch": 1.1250839682149587,
+      "grad_norm": 0.2590723931789398,
+      "learning_rate": 0.0001553851016450568,
+      "loss": 0.566,
+      "step": 152610
+    },
+    {
+      "epoch": 1.1251249283198166,
+      "grad_norm": 0.24066004157066345,
+      "learning_rate": 0.00015535520284908253,
+      "loss": 0.5611,
+      "step": 152620
+    },
+    {
+      "epoch": 1.1251658884246742,
+      "grad_norm": 0.23049475252628326,
+      "learning_rate": 0.0001553253056332049,
+      "loss": 0.558,
+      "step": 152630
+    },
+    {
+      "epoch": 1.1252068485295323,
+      "grad_norm": 0.23786190152168274,
+      "learning_rate": 0.00015529540999792302,
+      "loss": 0.5689,
+      "step": 152640
+    },
+    {
+      "epoch": 1.12524780863439,
+      "grad_norm": 0.223163902759552,
+      "learning_rate": 0.00015526551594373616,
+      "loss": 0.5611,
+      "step": 152650
+    },
+    {
+      "epoch": 1.1252887687392479,
+      "grad_norm": 0.24785561859607697,
+      "learning_rate": 0.00015523562347114327,
+      "loss": 0.5631,
+      "step": 152660
+    },
+    {
+      "epoch": 1.1253297288441058,
+      "grad_norm": 0.22849705815315247,
+      "learning_rate": 0.00015520573258064353,
+      "loss": 0.5621,
+      "step": 152670
+    },
+    {
+      "epoch": 1.1253706889489636,
+      "grad_norm": 0.2411494106054306,
+      "learning_rate": 0.00015517584327273578,
+      "loss": 0.5631,
+      "step": 152680
+    },
+    {
+      "epoch": 1.1254116490538215,
+      "grad_norm": 0.22378501296043396,
+      "learning_rate": 0.00015514595554791928,
+      "loss": 0.5535,
+      "step": 152690
+    },
+    {
+      "epoch": 1.1254526091586794,
+      "grad_norm": 0.23733602464199066,
+      "learning_rate": 0.0001551160694066928,
+      "loss": 0.5567,
+      "step": 152700
+    },
+    {
+      "epoch": 1.1254935692635373,
+      "grad_norm": 0.26041272282600403,
+      "learning_rate": 0.00015508618484955534,
+      "loss": 0.5536,
+      "step": 152710
+    },
+    {
+      "epoch": 1.1255345293683952,
+      "grad_norm": 0.2391430139541626,
+      "learning_rate": 0.0001550563018770058,
+      "loss": 0.5605,
+      "step": 152720
+    },
+    {
+      "epoch": 1.125575489473253,
+      "grad_norm": 0.23212002217769623,
+      "learning_rate": 0.00015502642048954312,
+      "loss": 0.5648,
+      "step": 152730
+    },
+    {
+      "epoch": 1.125616449578111,
+      "grad_norm": 0.21461953222751617,
+      "learning_rate": 0.00015499654068766623,
+      "loss": 0.5558,
+      "step": 152740
+    },
+    {
+      "epoch": 1.1256574096829688,
+      "grad_norm": 0.2368936687707901,
+      "learning_rate": 0.00015496666247187386,
+      "loss": 0.5617,
+      "step": 152750
+    },
+    {
+      "epoch": 1.1256983697878267,
+      "grad_norm": 0.23703087866306305,
+      "learning_rate": 0.00015493678584266494,
+      "loss": 0.5562,
+      "step": 152760
+    },
+    {
+      "epoch": 1.1257393298926845,
+      "grad_norm": 0.22494572401046753,
+      "learning_rate": 0.00015490691080053814,
+      "loss": 0.5558,
+      "step": 152770
+    },
+    {
+      "epoch": 1.1257802899975424,
+      "grad_norm": 0.23022456467151642,
+      "learning_rate": 0.00015487703734599225,
+      "loss": 0.555,
+      "step": 152780
+    },
+    {
+      "epoch": 1.1258212501024003,
+      "grad_norm": 0.2356519103050232,
+      "learning_rate": 0.0001548471654795261,
+      "loss": 0.5531,
+      "step": 152790
+    },
+    {
+      "epoch": 1.1258622102072582,
+      "grad_norm": 0.21882906556129456,
+      "learning_rate": 0.00015481729520163825,
+      "loss": 0.5537,
+      "step": 152800
+    },
+    {
+      "epoch": 1.125903170312116,
+      "grad_norm": 0.23445099592208862,
+      "learning_rate": 0.00015478742651282756,
+      "loss": 0.5586,
+      "step": 152810
+    },
+    {
+      "epoch": 1.125944130416974,
+      "grad_norm": 0.23974089324474335,
+      "learning_rate": 0.00015475755941359254,
+      "loss": 0.5556,
+      "step": 152820
+    },
+    {
+      "epoch": 1.1259850905218318,
+      "grad_norm": 0.2311934232711792,
+      "learning_rate": 0.00015472769390443197,
+      "loss": 0.5555,
+      "step": 152830
+    },
+    {
+      "epoch": 1.1260260506266897,
+      "grad_norm": 0.22662296891212463,
+      "learning_rate": 0.00015469782998584438,
+      "loss": 0.5466,
+      "step": 152840
+    },
+    {
+      "epoch": 1.1260670107315476,
+      "grad_norm": 0.227142795920372,
+      "learning_rate": 0.00015466796765832818,
+      "loss": 0.545,
+      "step": 152850
+    },
+    {
+      "epoch": 1.1261079708364052,
+      "grad_norm": 0.22120416164398193,
+      "learning_rate": 0.00015463810692238224,
+      "loss": 0.5509,
+      "step": 152860
+    },
+    {
+      "epoch": 1.1261489309412631,
+      "grad_norm": 0.22597011923789978,
+      "learning_rate": 0.0001546082477785048,
+      "loss": 0.554,
+      "step": 152870
+    },
+    {
+      "epoch": 1.126189891046121,
+      "grad_norm": 0.24797473847866058,
+      "learning_rate": 0.00015457839022719455,
+      "loss": 0.5597,
+      "step": 152880
+    },
+    {
+      "epoch": 1.1262308511509789,
+      "grad_norm": 0.25205472111701965,
+      "learning_rate": 0.0001545485342689499,
+      "loss": 0.5519,
+      "step": 152890
+    },
+    {
+      "epoch": 1.1262718112558368,
+      "grad_norm": 0.21713557839393616,
+      "learning_rate": 0.00015451867990426933,
+      "loss": 0.5521,
+      "step": 152900
+    },
+    {
+      "epoch": 1.1263127713606946,
+      "grad_norm": 0.22947101294994354,
+      "learning_rate": 0.0001544888271336512,
+      "loss": 0.5572,
+      "step": 152910
+    },
+    {
+      "epoch": 1.1263537314655525,
+      "grad_norm": 0.22842080891132355,
+      "learning_rate": 0.00015445897595759385,
+      "loss": 0.5555,
+      "step": 152920
+    },
+    {
+      "epoch": 1.1263946915704104,
+      "grad_norm": 0.2273808866739273,
+      "learning_rate": 0.00015442912637659577,
+      "loss": 0.5538,
+      "step": 152930
+    },
+    {
+      "epoch": 1.1264356516752683,
+      "grad_norm": 0.2290753871202469,
+      "learning_rate": 0.00015439927839115526,
+      "loss": 0.5474,
+      "step": 152940
+    },
+    {
+      "epoch": 1.1264766117801261,
+      "grad_norm": 0.23517906665802002,
+      "learning_rate": 0.0001543694320017706,
+      "loss": 0.5495,
+      "step": 152950
+    },
+    {
+      "epoch": 1.126517571884984,
+      "grad_norm": 0.22325338423252106,
+      "learning_rate": 0.00015433958720894008,
+      "loss": 0.5476,
+      "step": 152960
+    },
+    {
+      "epoch": 1.126558531989842,
+      "grad_norm": 0.23649455606937408,
+      "learning_rate": 0.00015430974401316208,
+      "loss": 0.545,
+      "step": 152970
+    },
+    {
+      "epoch": 1.1265994920946998,
+      "grad_norm": 0.232147678732872,
+      "learning_rate": 0.00015427990241493467,
+      "loss": 0.554,
+      "step": 152980
+    },
+    {
+      "epoch": 1.1266404521995577,
+      "grad_norm": 0.249918133020401,
+      "learning_rate": 0.0001542500624147561,
+      "loss": 0.5385,
+      "step": 152990
+    },
+    {
+      "epoch": 1.1266814123044155,
+      "grad_norm": 0.23529894649982452,
+      "learning_rate": 0.0001542202240131246,
+      "loss": 0.5526,
+      "step": 153000
     }
   ],
   "logging_steps": 10,
       "attributes": {}
     }
   },
+  "total_flos": 1.5183189452904727e+19,
   "train_batch_size": 2048,
   "trial_name": null,
   "trial_params": null