Training in progress, step 2200, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +711 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bde8d274ae01751106fa8159c69f54f42fa4a03cef59221fd3a97b874a59ce77
 size 2066752

 version https://git-lfs.github.com/spec/v1
+oid sha256:34e7dd2d9b6f0970cb6393fa01c4d5b46e08a118b2ca11c92398326a18aca9b6
 size 2066752

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3889beeb89792558f533d92bd93d1146cb11bb276fefd901c31f68cb3738bbd7
 size 4121235

 version https://git-lfs.github.com/spec/v1
+oid sha256:8ea6c445c08f665e093952010dc41c9cfe5bc6fd09fae8a9ddc99dbd25132738
 size 4121235

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5647e487afcf90db880b131ed9aa10793319336216d29341b095ec293227383
 size 14391

 version https://git-lfs.github.com/spec/v1
+oid sha256:606f8ae83137b5e17dffec803b5eb8d484f9023ac65a91db2b3909da806f7963
 size 14391

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f370f5706e3b0b381f57dfdce48d261ac33ac3b1e555ecac720ab1b913f1626d
 size 1401

 version https://git-lfs.github.com/spec/v1
+oid sha256:93112c230b7ca5a82c24435d90248d5e745b06d96f80a988308b962666674dd0
 size 1401

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.09069707178025395,
   "eval_steps": 100,
-  "global_step": 2100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -14876,6 +14876,714 @@
       "eval_samples_per_second": 1.709,
       "eval_steps_per_second": 0.214,
       "step": 2100
     }
   ],
   "logging_steps": 1,
@@ -14895,7 +15603,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 6712757452800.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.09501597996026605,
   "eval_steps": 100,
+  "global_step": 2200,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 1.709,
       "eval_steps_per_second": 0.214,
       "step": 2100
+    },
+    {
+      "epoch": 0.09074026086205407,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0009954814517481774,
+      "loss": 8.298,
+      "step": 2101
+    },
+    {
+      "epoch": 0.09078344994385419,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0009954718676495817,
+      "loss": 8.5746,
+      "step": 2102
+    },
+    {
+      "epoch": 0.09082663902565431,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0009954622734437753,
+      "loss": 8.4441,
+      "step": 2103
+    },
+    {
+      "epoch": 0.09086982810745443,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.000995452669130954,
+      "loss": 8.5023,
+      "step": 2104
+    },
+    {
+      "epoch": 0.09091301718925456,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0009954430547113133,
+      "loss": 7.9511,
+      "step": 2105
+    },
+    {
+      "epoch": 0.09095620627105468,
+      "grad_norm": 0.625,
+      "learning_rate": 0.0009954334301850497,
+      "loss": 7.9691,
+      "step": 2106
+    },
+    {
+      "epoch": 0.0909993953528548,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0009954237955523593,
+      "loss": 8.1564,
+      "step": 2107
+    },
+    {
+      "epoch": 0.09104258443465492,
+      "grad_norm": 0.58984375,
+      "learning_rate": 0.0009954141508134388,
+      "loss": 8.5454,
+      "step": 2108
+    },
+    {
+      "epoch": 0.09108577351645504,
+      "grad_norm": 0.66015625,
+      "learning_rate": 0.000995404495968485,
+      "loss": 8.4482,
+      "step": 2109
+    },
+    {
+      "epoch": 0.09112896259825516,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0009953948310176945,
+      "loss": 8.6743,
+      "step": 2110
+    },
+    {
+      "epoch": 0.09117215168005528,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.0009953851559612648,
+      "loss": 8.4618,
+      "step": 2111
+    },
+    {
+      "epoch": 0.0912153407618554,
+      "grad_norm": 0.4609375,
+      "learning_rate": 0.0009953754707993931,
+      "loss": 8.4147,
+      "step": 2112
+    },
+    {
+      "epoch": 0.09125852984365553,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.000995365775532277,
+      "loss": 8.43,
+      "step": 2113
+    },
+    {
+      "epoch": 0.09130171892545565,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0009953560701601145,
+      "loss": 8.3902,
+      "step": 2114
+    },
+    {
+      "epoch": 0.09134490800725577,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0009953463546831032,
+      "loss": 8.3769,
+      "step": 2115
+    },
+    {
+      "epoch": 0.09138809708905589,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0009953366291014414,
+      "loss": 8.1538,
+      "step": 2116
+    },
+    {
+      "epoch": 0.09143128617085601,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.0009953268934153278,
+      "loss": 8.4314,
+      "step": 2117
+    },
+    {
+      "epoch": 0.09147447525265613,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.0009953171476249606,
+      "loss": 8.3317,
+      "step": 2118
+    },
+    {
+      "epoch": 0.09151766433445625,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0009953073917305386,
+      "loss": 8.5086,
+      "step": 2119
+    },
+    {
+      "epoch": 0.09156085341625637,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.0009952976257322612,
+      "loss": 8.4208,
+      "step": 2120
+    },
+    {
+      "epoch": 0.0916040424980565,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0009952878496303273,
+      "loss": 8.0259,
+      "step": 2121
+    },
+    {
+      "epoch": 0.09164723157985662,
+      "grad_norm": 0.6015625,
+      "learning_rate": 0.0009952780634249366,
+      "loss": 8.3618,
+      "step": 2122
+    },
+    {
+      "epoch": 0.09169042066165674,
+      "grad_norm": 0.546875,
+      "learning_rate": 0.0009952682671162884,
+      "loss": 8.4246,
+      "step": 2123
+    },
+    {
+      "epoch": 0.09173360974345686,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.0009952584607045827,
+      "loss": 8.4246,
+      "step": 2124
+    },
+    {
+      "epoch": 0.09177679882525698,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0009952486441900195,
+      "loss": 8.729,
+      "step": 2125
+    },
+    {
+      "epoch": 0.0918199879070571,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.000995238817572799,
+      "loss": 8.2617,
+      "step": 2126
+    },
+    {
+      "epoch": 0.09186317698885722,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.000995228980853122,
+      "loss": 7.8786,
+      "step": 2127
+    },
+    {
+      "epoch": 0.09190636607065734,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.0009952191340311886,
+      "loss": 8.3024,
+      "step": 2128
+    },
+    {
+      "epoch": 0.09194955515245747,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.0009952092771072002,
+      "loss": 8.3837,
+      "step": 2129
+    },
+    {
+      "epoch": 0.09199274423425757,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0009951994100813575,
+      "loss": 8.2394,
+      "step": 2130
+    },
+    {
+      "epoch": 0.0920359333160577,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.0009951895329538619,
+      "loss": 8.3308,
+      "step": 2131
+    },
+    {
+      "epoch": 0.09207912239785782,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0009951796457249148,
+      "loss": 8.1792,
+      "step": 2132
+    },
+    {
+      "epoch": 0.09212231147965794,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0009951697483947181,
+      "loss": 8.358,
+      "step": 2133
+    },
+    {
+      "epoch": 0.09216550056145806,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0009951598409634738,
+      "loss": 8.2675,
+      "step": 2134
+    },
+    {
+      "epoch": 0.09220868964325818,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0009951499234313835,
+      "loss": 8.6303,
+      "step": 2135
+    },
+    {
+      "epoch": 0.0922518787250583,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.00099513999579865,
+      "loss": 8.2709,
+      "step": 2136
+    },
+    {
+      "epoch": 0.09229506780685842,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0009951300580654755,
+      "loss": 8.5636,
+      "step": 2137
+    },
+    {
+      "epoch": 0.09233825688865854,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.0009951201102320628,
+      "loss": 8.3999,
+      "step": 2138
+    },
+    {
+      "epoch": 0.09238144597045866,
+      "grad_norm": 3.6875,
+      "learning_rate": 0.0009951101522986147,
+      "loss": 8.4522,
+      "step": 2139
+    },
+    {
+      "epoch": 0.09242463505225879,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0009951001842653348,
+      "loss": 8.4834,
+      "step": 2140
+    },
+    {
+      "epoch": 0.0924678241340589,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.0009950902061324261,
+      "loss": 8.6161,
+      "step": 2141
+    },
+    {
+      "epoch": 0.09251101321585903,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0009950802179000923,
+      "loss": 8.5365,
+      "step": 2142
+    },
+    {
+      "epoch": 0.09255420229765915,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.0009950702195685366,
+      "loss": 8.1847,
+      "step": 2143
+    },
+    {
+      "epoch": 0.09259739137945927,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0009950602111379635,
+      "loss": 8.4196,
+      "step": 2144
+    },
+    {
+      "epoch": 0.09264058046125939,
+      "grad_norm": 0.482421875,
+      "learning_rate": 0.0009950501926085772,
+      "loss": 8.6901,
+      "step": 2145
+    },
+    {
+      "epoch": 0.09268376954305951,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0009950401639805821,
+      "loss": 8.2957,
+      "step": 2146
+    },
+    {
+      "epoch": 0.09272695862485963,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.0009950301252541823,
+      "loss": 8.0223,
+      "step": 2147
+    },
+    {
+      "epoch": 0.09277014770665976,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.000995020076429583,
+      "loss": 8.5349,
+      "step": 2148
+    },
+    {
+      "epoch": 0.09281333678845988,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.000995010017506989,
+      "loss": 8.454,
+      "step": 2149
+    },
+    {
+      "epoch": 0.09285652587026,
+      "grad_norm": 0.4609375,
+      "learning_rate": 0.0009949999484866058,
+      "loss": 8.4627,
+      "step": 2150
+    },
+    {
+      "epoch": 0.09289971495206012,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0009949898693686384,
+      "loss": 8.4625,
+      "step": 2151
+    },
+    {
+      "epoch": 0.09294290403386024,
+      "grad_norm": 0.4609375,
+      "learning_rate": 0.0009949797801532924,
+      "loss": 8.3837,
+      "step": 2152
+    },
+    {
+      "epoch": 0.09298609311566036,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.0009949696808407738,
+      "loss": 8.3657,
+      "step": 2153
+    },
+    {
+      "epoch": 0.09302928219746048,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0009949595714312887,
+      "loss": 8.4035,
+      "step": 2154
+    },
+    {
+      "epoch": 0.0930724712792606,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.0009949494519250434,
+      "loss": 8.3168,
+      "step": 2155
+    },
+    {
+      "epoch": 0.09311566036106073,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.000994939322322244,
+      "loss": 8.3687,
+      "step": 2156
+    },
+    {
+      "epoch": 0.09315884944286085,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.000994929182623097,
+      "loss": 8.1541,
+      "step": 2157
+    },
+    {
+      "epoch": 0.09320203852466097,
+      "grad_norm": 0.69140625,
+      "learning_rate": 0.0009949190328278098,
+      "loss": 8.3242,
+      "step": 2158
+    },
+    {
+      "epoch": 0.09324522760646109,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.0009949088729365894,
+      "loss": 8.7448,
+      "step": 2159
+    },
+    {
+      "epoch": 0.09328841668826121,
+      "grad_norm": 0.494140625,
+      "learning_rate": 0.0009948987029496426,
+      "loss": 8.3553,
+      "step": 2160
+    },
+    {
+      "epoch": 0.09333160577006133,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.0009948885228671768,
+      "loss": 8.2736,
+      "step": 2161
+    },
+    {
+      "epoch": 0.09337479485186145,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.0009948783326894004,
+      "loss": 8.3592,
+      "step": 2162
+    },
+    {
+      "epoch": 0.09341798393366157,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0009948681324165206,
+      "loss": 8.5712,
+      "step": 2163
+    },
+    {
+      "epoch": 0.0934611730154617,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0009948579220487458,
+      "loss": 8.4489,
+      "step": 2164
+    },
+    {
+      "epoch": 0.09350436209726182,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.000994847701586284,
+      "loss": 8.3211,
+      "step": 2165
+    },
+    {
+      "epoch": 0.09354755117906194,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.0009948374710293442,
+      "loss": 8.5147,
+      "step": 2166
+    },
+    {
+      "epoch": 0.09359074026086206,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0009948272303781345,
+      "loss": 8.5106,
+      "step": 2167
+    },
+    {
+      "epoch": 0.09363392934266218,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.0009948169796328641,
+      "loss": 8.3855,
+      "step": 2168
+    },
+    {
+      "epoch": 0.0936771184244623,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.000994806718793742,
+      "loss": 8.5642,
+      "step": 2169
+    },
+    {
+      "epoch": 0.09372030750626242,
+      "grad_norm": 0.671875,
+      "learning_rate": 0.0009947964478609777,
+      "loss": 8.4041,
+      "step": 2170
+    },
+    {
+      "epoch": 0.09376349658806253,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0009947861668347807,
+      "loss": 8.6566,
+      "step": 2171
+    },
+    {
+      "epoch": 0.09380668566986265,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.0009947758757153605,
+      "loss": 8.3748,
+      "step": 2172
+    },
+    {
+      "epoch": 0.09384987475166277,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.000994765574502927,
+      "loss": 8.4505,
+      "step": 2173
+    },
+    {
+      "epoch": 0.0938930638334629,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0009947552631976908,
+      "loss": 8.6221,
+      "step": 2174
+    },
+    {
+      "epoch": 0.09393625291526302,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0009947449417998617,
+      "loss": 8.1772,
+      "step": 2175
+    },
+    {
+      "epoch": 0.09397944199706314,
+      "grad_norm": 0.62890625,
+      "learning_rate": 0.0009947346103096506,
+      "loss": 8.2067,
+      "step": 2176
+    },
+    {
+      "epoch": 0.09402263107886326,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.000994724268727268,
+      "loss": 8.7464,
+      "step": 2177
+    },
+    {
+      "epoch": 0.09406582016066338,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.000994713917052925,
+      "loss": 8.4471,
+      "step": 2178
+    },
+    {
+      "epoch": 0.0941090092424635,
+      "grad_norm": 0.59765625,
+      "learning_rate": 0.000994703555286833,
+      "loss": 8.1524,
+      "step": 2179
+    },
+    {
+      "epoch": 0.09415219832426362,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.000994693183429203,
+      "loss": 8.7342,
+      "step": 2180
+    },
+    {
+      "epoch": 0.09419538740606374,
+      "grad_norm": 0.48046875,
+      "learning_rate": 0.0009946828014802467,
+      "loss": 8.5416,
+      "step": 2181
+    },
+    {
+      "epoch": 0.09423857648786386,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.000994672409440176,
+      "loss": 8.4049,
+      "step": 2182
+    },
+    {
+      "epoch": 0.09428176556966399,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0009946620073092026,
+      "loss": 8.5609,
+      "step": 2183
+    },
+    {
+      "epoch": 0.09432495465146411,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.000994651595087539,
+      "loss": 8.3572,
+      "step": 2184
+    },
+    {
+      "epoch": 0.09436814373326423,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.0009946411727753974,
+      "loss": 8.262,
+      "step": 2185
+    },
+    {
+      "epoch": 0.09441133281506435,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0009946307403729906,
+      "loss": 8.6386,
+      "step": 2186
+    },
+    {
+      "epoch": 0.09445452189686447,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0009946202978805313,
+      "loss": 8.4235,
+      "step": 2187
+    },
+    {
+      "epoch": 0.09449771097866459,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0009946098452982325,
+      "loss": 8.6327,
+      "step": 2188
+    },
+    {
+      "epoch": 0.09454090006046471,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0009945993826263072,
+      "loss": 8.3352,
+      "step": 2189
+    },
+    {
+      "epoch": 0.09458408914226483,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.0009945889098649692,
+      "loss": 8.0024,
+      "step": 2190
+    },
+    {
+      "epoch": 0.09462727822406496,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.0009945784270144321,
+      "loss": 8.2995,
+      "step": 2191
+    },
+    {
+      "epoch": 0.09467046730586508,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0009945679340749097,
+      "loss": 8.4291,
+      "step": 2192
+    },
+    {
+      "epoch": 0.0947136563876652,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.0009945574310466158,
+      "loss": 8.2569,
+      "step": 2193
+    },
+    {
+      "epoch": 0.09475684546946532,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.000994546917929765,
+      "loss": 8.2783,
+      "step": 2194
+    },
+    {
+      "epoch": 0.09480003455126544,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0009945363947245717,
+      "loss": 8.2944,
+      "step": 2195
+    },
+    {
+      "epoch": 0.09484322363306556,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.0009945258614312501,
+      "loss": 8.2435,
+      "step": 2196
+    },
+    {
+      "epoch": 0.09488641271486568,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.0009945153180500156,
+      "loss": 8.4701,
+      "step": 2197
+    },
+    {
+      "epoch": 0.0949296017966658,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0009945047645810831,
+      "loss": 8.3213,
+      "step": 2198
+    },
+    {
+      "epoch": 0.09497279087846593,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009944942010246681,
+      "loss": 8.3223,
+      "step": 2199
+    },
+    {
+      "epoch": 0.09501597996026605,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.0009944836273809857,
+      "loss": 8.3426,
+      "step": 2200
+    },
+    {
+      "epoch": 0.09501597996026605,
+      "eval_loss": 8.426615715026855,
+      "eval_runtime": 13.965,
+      "eval_samples_per_second": 1.719,
+      "eval_steps_per_second": 0.215,
+      "step": 2200
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 7032412569600.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null