irodkin commited on
Commit
ded0b6b
·
verified ·
1 Parent(s): 7866a9b

Training checkpoint at step 6500

Browse files
Files changed (1) hide show
  1. trainer_state.json +186 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 6000,
3
- "best_metric": 2.594125509262085,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-6000",
5
- "epoch": 0.12,
6
  "eval_steps": 100,
7
- "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2168,6 +2168,186 @@
2168
  "eval_samples_per_second": 2.417,
2169
  "eval_steps_per_second": 1.208,
2170
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2171
  }
2172
  ],
2173
  "logging_steps": 25,
@@ -2187,7 +2367,7 @@
2187
  "attributes": {}
2188
  }
2189
  },
2190
- "total_flos": 1.3465316020260438e+19,
2191
  "train_batch_size": 1,
2192
  "trial_name": null,
2193
  "trial_params": null
 
1
  {
2
+ "best_global_step": 6500,
3
+ "best_metric": 2.585561990737915,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-6500",
5
+ "epoch": 0.13,
6
  "eval_steps": 100,
7
+ "global_step": 6500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2168
  "eval_samples_per_second": 2.417,
2169
  "eval_steps_per_second": 1.208,
2170
  "step": 6000
2171
+ },
2172
+ {
2173
+ "epoch": 0.1205,
2174
+ "grad_norm": 2.1932467610128166,
2175
+ "learning_rate": 9.772444444444445e-06,
2176
+ "loss": 2.5892,
2177
+ "step": 6025
2178
+ },
2179
+ {
2180
+ "epoch": 0.121,
2181
+ "grad_norm": 2.387425729108963,
2182
+ "learning_rate": 9.76688888888889e-06,
2183
+ "loss": 2.5918,
2184
+ "step": 6050
2185
+ },
2186
+ {
2187
+ "epoch": 0.1215,
2188
+ "grad_norm": 2.8624591702116313,
2189
+ "learning_rate": 9.761333333333334e-06,
2190
+ "loss": 2.5875,
2191
+ "step": 6075
2192
+ },
2193
+ {
2194
+ "epoch": 0.122,
2195
+ "grad_norm": 2.930012610934339,
2196
+ "learning_rate": 9.755777777777778e-06,
2197
+ "loss": 2.5906,
2198
+ "step": 6100
2199
+ },
2200
+ {
2201
+ "epoch": 0.122,
2202
+ "eval_loss": 2.592097282409668,
2203
+ "eval_runtime": 42.1118,
2204
+ "eval_samples_per_second": 2.47,
2205
+ "eval_steps_per_second": 1.235,
2206
+ "step": 6100
2207
+ },
2208
+ {
2209
+ "epoch": 0.1225,
2210
+ "grad_norm": 3.6585883804987596,
2211
+ "learning_rate": 9.750222222222223e-06,
2212
+ "loss": 2.5888,
2213
+ "step": 6125
2214
+ },
2215
+ {
2216
+ "epoch": 0.123,
2217
+ "grad_norm": 2.9636602337569213,
2218
+ "learning_rate": 9.744666666666668e-06,
2219
+ "loss": 2.5848,
2220
+ "step": 6150
2221
+ },
2222
+ {
2223
+ "epoch": 0.1235,
2224
+ "grad_norm": 2.6452546886265242,
2225
+ "learning_rate": 9.739111111111112e-06,
2226
+ "loss": 2.5875,
2227
+ "step": 6175
2228
+ },
2229
+ {
2230
+ "epoch": 0.124,
2231
+ "grad_norm": 2.230890007256631,
2232
+ "learning_rate": 9.733555555555555e-06,
2233
+ "loss": 2.5928,
2234
+ "step": 6200
2235
+ },
2236
+ {
2237
+ "epoch": 0.124,
2238
+ "eval_loss": 2.591871976852417,
2239
+ "eval_runtime": 42.2393,
2240
+ "eval_samples_per_second": 2.462,
2241
+ "eval_steps_per_second": 1.231,
2242
+ "step": 6200
2243
+ },
2244
+ {
2245
+ "epoch": 0.1245,
2246
+ "grad_norm": 2.2263966783946643,
2247
+ "learning_rate": 9.728e-06,
2248
+ "loss": 2.5913,
2249
+ "step": 6225
2250
+ },
2251
+ {
2252
+ "epoch": 0.125,
2253
+ "grad_norm": 3.0917521864623168,
2254
+ "learning_rate": 9.722444444444446e-06,
2255
+ "loss": 2.5858,
2256
+ "step": 6250
2257
+ },
2258
+ {
2259
+ "epoch": 0.1255,
2260
+ "grad_norm": 3.406162518240377,
2261
+ "learning_rate": 9.71688888888889e-06,
2262
+ "loss": 2.5824,
2263
+ "step": 6275
2264
+ },
2265
+ {
2266
+ "epoch": 0.126,
2267
+ "grad_norm": 1.9288658675383707,
2268
+ "learning_rate": 9.711333333333333e-06,
2269
+ "loss": 2.5881,
2270
+ "step": 6300
2271
+ },
2272
+ {
2273
+ "epoch": 0.126,
2274
+ "eval_loss": 2.588792085647583,
2275
+ "eval_runtime": 42.1993,
2276
+ "eval_samples_per_second": 2.464,
2277
+ "eval_steps_per_second": 1.232,
2278
+ "step": 6300
2279
+ },
2280
+ {
2281
+ "epoch": 0.1265,
2282
+ "grad_norm": 2.3054152552517557,
2283
+ "learning_rate": 9.705777777777778e-06,
2284
+ "loss": 2.5777,
2285
+ "step": 6325
2286
+ },
2287
+ {
2288
+ "epoch": 0.127,
2289
+ "grad_norm": 2.4215099152732438,
2290
+ "learning_rate": 9.700222222222224e-06,
2291
+ "loss": 2.5905,
2292
+ "step": 6350
2293
+ },
2294
+ {
2295
+ "epoch": 0.1275,
2296
+ "grad_norm": 2.1008082850001584,
2297
+ "learning_rate": 9.694666666666667e-06,
2298
+ "loss": 2.5891,
2299
+ "step": 6375
2300
+ },
2301
+ {
2302
+ "epoch": 0.128,
2303
+ "grad_norm": 2.548161937775528,
2304
+ "learning_rate": 9.68911111111111e-06,
2305
+ "loss": 2.5828,
2306
+ "step": 6400
2307
+ },
2308
+ {
2309
+ "epoch": 0.128,
2310
+ "eval_loss": 2.588566780090332,
2311
+ "eval_runtime": 42.2757,
2312
+ "eval_samples_per_second": 2.46,
2313
+ "eval_steps_per_second": 1.23,
2314
+ "step": 6400
2315
+ },
2316
+ {
2317
+ "epoch": 0.1285,
2318
+ "grad_norm": 2.1721864313913555,
2319
+ "learning_rate": 9.683555555555556e-06,
2320
+ "loss": 2.585,
2321
+ "step": 6425
2322
+ },
2323
+ {
2324
+ "epoch": 0.129,
2325
+ "grad_norm": 2.6656100643358567,
2326
+ "learning_rate": 9.678000000000001e-06,
2327
+ "loss": 2.5859,
2328
+ "step": 6450
2329
+ },
2330
+ {
2331
+ "epoch": 0.1295,
2332
+ "grad_norm": 2.14442087538069,
2333
+ "learning_rate": 9.672444444444445e-06,
2334
+ "loss": 2.5897,
2335
+ "step": 6475
2336
+ },
2337
+ {
2338
+ "epoch": 0.13,
2339
+ "grad_norm": 2.544695719649347,
2340
+ "learning_rate": 9.66688888888889e-06,
2341
+ "loss": 2.5819,
2342
+ "step": 6500
2343
+ },
2344
+ {
2345
+ "epoch": 0.13,
2346
+ "eval_loss": 2.585561990737915,
2347
+ "eval_runtime": 42.2362,
2348
+ "eval_samples_per_second": 2.462,
2349
+ "eval_steps_per_second": 1.231,
2350
+ "step": 6500
2351
  }
2352
  ],
2353
  "logging_steps": 25,
 
2367
  "attributes": {}
2368
  }
2369
  },
2370
+ "total_flos": 1.4587425704363688e+19,
2371
  "train_batch_size": 1,
2372
  "trial_name": null,
2373
  "trial_params": null