Azrail commited on
Commit
bf30560
·
verified ·
1 Parent(s): 6e9ea4a

Training in progress, step 13000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b83de3716293416e17f57907b1e6034054cf0cb82c7485e524b4d7d1450783b
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b7373ac11401636769557d7c41bd131eaa1ff29f1ac0bd8ece04d73a85d45b3
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a64cb29e942a69a8dc03ff6ac3a4e293f03dde8909732e3b914b2a3bf04f6716
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:600ff1a38a47f869ae5492791562a9ea82c55e0368079b5f56587277995a7652
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e88c68399442716e4a372c4039d5dcf90ac56e28a588e1c0ea57e0e690737de
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa45bf7feccf57a31c0d1db361074f3cc8988037f2a20ad89dd89a197a5582fe
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8af59fb9ac4de4ac193b8a4959e006fc89e2686baafa42f4be575214da0ad2e3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:314b286b574cdec8b8035ea2a5d06f7aaf8f954a409646e55b7a4304b27476aa
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.26359251891482255,
6
  "eval_steps": 500,
7
- "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2144,11 +2144,189 @@
2144
  "eval_steps_per_second": 19.086,
2145
  "num_input_tokens_seen": 12582912000,
2146
  "step": 12000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2147
  }
2148
  ],
2149
  "logging_steps": 50,
2150
  "max_steps": 200000,
2151
- "num_input_tokens_seen": 12582912000,
2152
  "num_train_epochs": 5,
2153
  "save_steps": 1000,
2154
  "stateful_callbacks": {
@@ -2163,7 +2341,7 @@
2163
  "attributes": {}
2164
  }
2165
  },
2166
- "total_flos": 7.166060591579136e+18,
2167
  "train_batch_size": 64,
2168
  "trial_name": null,
2169
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2855585621577244,
6
  "eval_steps": 500,
7
+ "global_step": 13000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2144
  "eval_steps_per_second": 19.086,
2145
  "num_input_tokens_seen": 12582912000,
2146
  "step": 12000
2147
+ },
2148
+ {
2149
+ "epoch": 0.2646908210769676,
2150
+ "grad_norm": 0.1443321257829666,
2151
+ "learning_rate": 0.001,
2152
+ "loss": 2.7866,
2153
+ "num_input_tokens_seen": 12635340800,
2154
+ "step": 12050
2155
+ },
2156
+ {
2157
+ "epoch": 0.26578912323911275,
2158
+ "grad_norm": 0.12249191850423813,
2159
+ "learning_rate": 0.001,
2160
+ "loss": 2.8,
2161
+ "num_input_tokens_seen": 12687769600,
2162
+ "step": 12100
2163
+ },
2164
+ {
2165
+ "epoch": 0.2668874254012578,
2166
+ "grad_norm": 0.1505623608827591,
2167
+ "learning_rate": 0.001,
2168
+ "loss": 2.7934,
2169
+ "num_input_tokens_seen": 12740198400,
2170
+ "step": 12150
2171
+ },
2172
+ {
2173
+ "epoch": 0.26798572756340294,
2174
+ "grad_norm": 0.17367833852767944,
2175
+ "learning_rate": 0.001,
2176
+ "loss": 2.7905,
2177
+ "num_input_tokens_seen": 12792627200,
2178
+ "step": 12200
2179
+ },
2180
+ {
2181
+ "epoch": 0.269084029725548,
2182
+ "grad_norm": 0.12189670652151108,
2183
+ "learning_rate": 0.001,
2184
+ "loss": 2.7878,
2185
+ "num_input_tokens_seen": 12845056000,
2186
+ "step": 12250
2187
+ },
2188
+ {
2189
+ "epoch": 0.27018233188769314,
2190
+ "grad_norm": 0.12834201753139496,
2191
+ "learning_rate": 0.001,
2192
+ "loss": 2.7822,
2193
+ "num_input_tokens_seen": 12897484800,
2194
+ "step": 12300
2195
+ },
2196
+ {
2197
+ "epoch": 0.2712806340498382,
2198
+ "grad_norm": 0.1277332305908203,
2199
+ "learning_rate": 0.001,
2200
+ "loss": 2.7846,
2201
+ "num_input_tokens_seen": 12949913600,
2202
+ "step": 12350
2203
+ },
2204
+ {
2205
+ "epoch": 0.2723789362119833,
2206
+ "grad_norm": 0.14190761744976044,
2207
+ "learning_rate": 0.001,
2208
+ "loss": 2.7845,
2209
+ "num_input_tokens_seen": 13002342400,
2210
+ "step": 12400
2211
+ },
2212
+ {
2213
+ "epoch": 0.2734772383741284,
2214
+ "grad_norm": 0.14843693375587463,
2215
+ "learning_rate": 0.001,
2216
+ "loss": 2.7847,
2217
+ "num_input_tokens_seen": 13054771200,
2218
+ "step": 12450
2219
+ },
2220
+ {
2221
+ "epoch": 0.2745755405362735,
2222
+ "grad_norm": 0.14427120983600616,
2223
+ "learning_rate": 0.001,
2224
+ "loss": 2.78,
2225
+ "num_input_tokens_seen": 13107200000,
2226
+ "step": 12500
2227
+ },
2228
+ {
2229
+ "epoch": 0.2745755405362735,
2230
+ "eval_loss": 2.6847124099731445,
2231
+ "eval_runtime": 65.0448,
2232
+ "eval_samples_per_second": 76.87,
2233
+ "eval_steps_per_second": 19.218,
2234
+ "num_input_tokens_seen": 13107200000,
2235
+ "step": 12500
2236
+ },
2237
+ {
2238
+ "epoch": 0.2756738426984186,
2239
+ "grad_norm": 0.14408434927463531,
2240
+ "learning_rate": 0.001,
2241
+ "loss": 2.7794,
2242
+ "num_input_tokens_seen": 13159628800,
2243
+ "step": 12550
2244
+ },
2245
+ {
2246
+ "epoch": 0.2767721448605637,
2247
+ "grad_norm": 0.1557396501302719,
2248
+ "learning_rate": 0.001,
2249
+ "loss": 2.7754,
2250
+ "num_input_tokens_seen": 13212057600,
2251
+ "step": 12600
2252
+ },
2253
+ {
2254
+ "epoch": 0.27787044702270874,
2255
+ "grad_norm": 0.11494632810354233,
2256
+ "learning_rate": 0.001,
2257
+ "loss": 2.7839,
2258
+ "num_input_tokens_seen": 13264486400,
2259
+ "step": 12650
2260
+ },
2261
+ {
2262
+ "epoch": 0.27896874918485387,
2263
+ "grad_norm": 0.12402207404375076,
2264
+ "learning_rate": 0.001,
2265
+ "loss": 2.7773,
2266
+ "num_input_tokens_seen": 13316915200,
2267
+ "step": 12700
2268
+ },
2269
+ {
2270
+ "epoch": 0.28006705134699894,
2271
+ "grad_norm": 0.1308801770210266,
2272
+ "learning_rate": 0.001,
2273
+ "loss": 2.7864,
2274
+ "num_input_tokens_seen": 13369344000,
2275
+ "step": 12750
2276
+ },
2277
+ {
2278
+ "epoch": 0.28116535350914407,
2279
+ "grad_norm": 0.13596223294734955,
2280
+ "learning_rate": 0.001,
2281
+ "loss": 2.7763,
2282
+ "num_input_tokens_seen": 13421772800,
2283
+ "step": 12800
2284
+ },
2285
+ {
2286
+ "epoch": 0.28226365567128914,
2287
+ "grad_norm": 0.13256165385246277,
2288
+ "learning_rate": 0.001,
2289
+ "loss": 2.7762,
2290
+ "num_input_tokens_seen": 13474201600,
2291
+ "step": 12850
2292
+ },
2293
+ {
2294
+ "epoch": 0.28336195783343426,
2295
+ "grad_norm": 0.12955094873905182,
2296
+ "learning_rate": 0.001,
2297
+ "loss": 2.7823,
2298
+ "num_input_tokens_seen": 13526630400,
2299
+ "step": 12900
2300
+ },
2301
+ {
2302
+ "epoch": 0.28446025999557933,
2303
+ "grad_norm": 0.13506431877613068,
2304
+ "learning_rate": 0.001,
2305
+ "loss": 2.774,
2306
+ "num_input_tokens_seen": 13579059200,
2307
+ "step": 12950
2308
+ },
2309
+ {
2310
+ "epoch": 0.2855585621577244,
2311
+ "grad_norm": 0.14323291182518005,
2312
+ "learning_rate": 0.001,
2313
+ "loss": 2.7755,
2314
+ "num_input_tokens_seen": 13631488000,
2315
+ "step": 13000
2316
+ },
2317
+ {
2318
+ "epoch": 0.2855585621577244,
2319
+ "eval_loss": 2.6779518127441406,
2320
+ "eval_runtime": 66.0334,
2321
+ "eval_samples_per_second": 75.719,
2322
+ "eval_steps_per_second": 18.93,
2323
+ "num_input_tokens_seen": 13631488000,
2324
+ "step": 13000
2325
  }
2326
  ],
2327
  "logging_steps": 50,
2328
  "max_steps": 200000,
2329
+ "num_input_tokens_seen": 13631488000,
2330
  "num_train_epochs": 5,
2331
  "save_steps": 1000,
2332
  "stateful_callbacks": {
 
2341
  "attributes": {}
2342
  }
2343
  },
2344
+ "total_flos": 7.763232307544064e+18,
2345
  "train_batch_size": 64,
2346
  "trial_name": null,
2347
  "trial_params": null