ljcamargo commited on
Commit
f7b177d
·
verified ·
1 Parent(s): 8c5a829

Training in progress, step 3300, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bce8f23fe3ca54ce3a2edb635dddee9a9c329ff98d2c1c1edfce5ae419285005
3
  size 3237829088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:365157e0cc91c6ea82754070aed20459af7616cdd87d96a38b0933e4ebe719a6
3
  size 3237829088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d48dac810dbf197987947f41644e305ca6c790e2f1c59ed452325bdbb5ba36f8
3
  size 2062272049
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bca2f83c7c62bb2baeb05f97ac5a95135b02a1d4757160680bb94bbe4a6a7b0a
3
  size 2062272049
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f53ea6cca8172a2d83848e49f423fb2d70a5aed099439177ccfef05efb329dc
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2f931a6aed50b06e410ca372eb5f503052ee3f5905b5b560a45a62d502dc2ff
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc83c3e7cbaa37ce1778897d2e62cb7b8c41ece6f6ed0596eecff4f3f73fcd86
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92459d776349d1cc2d4327d5ed9e474de76e06b8e6491efc16a39d8110d2a844
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:969f9b4cf700cdbd38bb540453f172b054b7fbd95efaee88d0519f169dc71ac6
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d134eca5097bea9b8988d832ccb2da62b2a551181674ab666a19da6cf129c3d
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.8609556607834696,
6
  "eval_steps": 300,
7
- "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2108,6 +2108,216 @@
2108
  "learning_rate": 9.943054290774756e-06,
2109
  "loss": 0.7574,
2110
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2111
  }
2112
  ],
2113
  "logging_steps": 10,
@@ -2127,7 +2337,7 @@
2127
  "attributes": {}
2128
  }
2129
  },
2130
- "total_flos": 1.2275648299008e+20,
2131
  "train_batch_size": 6,
2132
  "trial_name": null,
2133
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9470512268618166,
6
  "eval_steps": 300,
7
+ "global_step": 3300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2108
  "learning_rate": 9.943054290774756e-06,
2109
  "loss": 0.7574,
2110
  "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 0.8638255129860812,
2114
+ "grad_norm": 5.900289535522461,
2115
+ "learning_rate": 9.547076661622922e-06,
2116
+ "loss": 0.7758,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 0.8666953651886928,
2121
+ "grad_norm": 5.241759777069092,
2122
+ "learning_rate": 9.15874942234024e-06,
2123
+ "loss": 0.7805,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 0.8695652173913043,
2128
+ "grad_norm": 4.609664440155029,
2129
+ "learning_rate": 8.778105417136395e-06,
2130
+ "loss": 0.7642,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 0.8724350695939159,
2135
+ "grad_norm": 6.470444202423096,
2136
+ "learning_rate": 8.405176840383122e-06,
2137
+ "loss": 0.7928,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 0.8753049217965275,
2142
+ "grad_norm": 3.531794786453247,
2143
+ "learning_rate": 8.039995233891362e-06,
2144
+ "loss": 0.7503,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 0.878174773999139,
2149
+ "grad_norm": 5.537559986114502,
2150
+ "learning_rate": 7.682591484243417e-06,
2151
+ "loss": 0.7343,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 0.8810446262017506,
2156
+ "grad_norm": 3.7967238426208496,
2157
+ "learning_rate": 7.332995820180677e-06,
2158
+ "loss": 0.7345,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 0.8839144784043622,
2163
+ "grad_norm": 4.1268839836120605,
2164
+ "learning_rate": 6.991237810046847e-06,
2165
+ "loss": 0.7557,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 0.8867843306069737,
2170
+ "grad_norm": 7.182312965393066,
2171
+ "learning_rate": 6.6573463592871085e-06,
2172
+ "loss": 0.7635,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 0.8896541828095853,
2177
+ "grad_norm": 3.4768388271331787,
2178
+ "learning_rate": 6.331349708003365e-06,
2179
+ "loss": 0.7325,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 0.8925240350121969,
2184
+ "grad_norm": 5.252262115478516,
2185
+ "learning_rate": 6.013275428565712e-06,
2186
+ "loss": 0.7513,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 0.8953938872148084,
2191
+ "grad_norm": 4.213047027587891,
2192
+ "learning_rate": 5.703150423280401e-06,
2193
+ "loss": 0.7685,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 0.89826373941742,
2198
+ "grad_norm": 4.207084655761719,
2199
+ "learning_rate": 5.401000922114485e-06,
2200
+ "loss": 0.7313,
2201
+ "step": 3130
2202
+ },
2203
+ {
2204
+ "epoch": 0.9011335916200316,
2205
+ "grad_norm": 6.862100124359131,
2206
+ "learning_rate": 5.10685248047732e-06,
2207
+ "loss": 0.7626,
2208
+ "step": 3140
2209
+ },
2210
+ {
2211
+ "epoch": 0.9040034438226431,
2212
+ "grad_norm": 3.541048049926758,
2213
+ "learning_rate": 4.82072997705908e-06,
2214
+ "loss": 0.7748,
2215
+ "step": 3150
2216
+ },
2217
+ {
2218
+ "epoch": 0.9068732960252547,
2219
+ "grad_norm": 4.149963855743408,
2220
+ "learning_rate": 4.542657611726664e-06,
2221
+ "loss": 0.7651,
2222
+ "step": 3160
2223
+ },
2224
+ {
2225
+ "epoch": 0.9097431482278663,
2226
+ "grad_norm": 6.455443859100342,
2227
+ "learning_rate": 4.272658903476745e-06,
2228
+ "loss": 0.7769,
2229
+ "step": 3170
2230
+ },
2231
+ {
2232
+ "epoch": 0.9126130004304779,
2233
+ "grad_norm": 5.111416339874268,
2234
+ "learning_rate": 4.010756688446726e-06,
2235
+ "loss": 0.779,
2236
+ "step": 3180
2237
+ },
2238
+ {
2239
+ "epoch": 0.9154828526330894,
2240
+ "grad_norm": 5.0384440422058105,
2241
+ "learning_rate": 3.7569731179831537e-06,
2242
+ "loss": 0.7353,
2243
+ "step": 3190
2244
+ },
2245
+ {
2246
+ "epoch": 0.918352704835701,
2247
+ "grad_norm": 4.619420528411865,
2248
+ "learning_rate": 3.5113296567682476e-06,
2249
+ "loss": 0.7686,
2250
+ "step": 3200
2251
+ },
2252
+ {
2253
+ "epoch": 0.9212225570383126,
2254
+ "grad_norm": 5.13969612121582,
2255
+ "learning_rate": 3.2738470810044553e-06,
2256
+ "loss": 0.7475,
2257
+ "step": 3210
2258
+ },
2259
+ {
2260
+ "epoch": 0.9240924092409241,
2261
+ "grad_norm": 4.138948917388916,
2262
+ "learning_rate": 3.0445454766572235e-06,
2263
+ "loss": 0.743,
2264
+ "step": 3220
2265
+ },
2266
+ {
2267
+ "epoch": 0.9269622614435357,
2268
+ "grad_norm": 3.4994235038757324,
2269
+ "learning_rate": 2.8234442377561232e-06,
2270
+ "loss": 0.7491,
2271
+ "step": 3230
2272
+ },
2273
+ {
2274
+ "epoch": 0.9298321136461473,
2275
+ "grad_norm": 3.714160442352295,
2276
+ "learning_rate": 2.6105620647545734e-06,
2277
+ "loss": 0.7516,
2278
+ "step": 3240
2279
+ },
2280
+ {
2281
+ "epoch": 0.9327019658487588,
2282
+ "grad_norm": 3.1646008491516113,
2283
+ "learning_rate": 2.4059169629481403e-06,
2284
+ "loss": 0.751,
2285
+ "step": 3250
2286
+ },
2287
+ {
2288
+ "epoch": 0.9355718180513704,
2289
+ "grad_norm": 4.828333377838135,
2290
+ "learning_rate": 2.209526240951665e-06,
2291
+ "loss": 0.741,
2292
+ "step": 3260
2293
+ },
2294
+ {
2295
+ "epoch": 0.938441670253982,
2296
+ "grad_norm": 3.3315179347991943,
2297
+ "learning_rate": 2.021406509235402e-06,
2298
+ "loss": 0.7554,
2299
+ "step": 3270
2300
+ },
2301
+ {
2302
+ "epoch": 0.9413115224565934,
2303
+ "grad_norm": 6.141576766967773,
2304
+ "learning_rate": 1.8415736787200433e-06,
2305
+ "loss": 0.7465,
2306
+ "step": 3280
2307
+ },
2308
+ {
2309
+ "epoch": 0.944181374659205,
2310
+ "grad_norm": 4.839749336242676,
2311
+ "learning_rate": 1.6700429594310063e-06,
2312
+ "loss": 0.761,
2313
+ "step": 3290
2314
+ },
2315
+ {
2316
+ "epoch": 0.9470512268618166,
2317
+ "grad_norm": 4.683228969573975,
2318
+ "learning_rate": 1.5068288592120283e-06,
2319
+ "loss": 0.751,
2320
+ "step": 3300
2321
  }
2322
  ],
2323
  "logging_steps": 10,
 
2337
  "attributes": {}
2338
  }
2339
  },
2340
+ "total_flos": 1.35032131289088e+20,
2341
  "train_batch_size": 6,
2342
  "trial_name": null,
2343
  "trial_params": null