aghatage committed on
Commit
5ef793c
·
verified ·
1 Parent(s): 4126143

Training in progress, step 6000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e175cfbb3b1e5047d2a07a1f65e6011d48b21f6ac86f4b54bb7a003b3e25ddd9
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18b75b4abf894259d125060201a5ccf51810364d91fd3cea0d60d17a6403b9f6
3
  size 12017472
last-checkpoint/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f20cbcc97cfbc77c9b6781af3fb327a8e590f45e5e58ba4784769f47e7581826
3
+ size 71982309
last-checkpoint/global_step6000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4542b69e5e21b1b4b16e2e496d3d3499a0b7f568477280db01e3d784c01ad1d0
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step5500
 
1
+ global_step6000
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c75d63279b47e795ad4622a2e3404a0983cd22c3e120a053ebabf9c78e50af21
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8941841568d79a9d2dc41b93071be9e47935d7e9eda669b8a7caaabc7faae599
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 5500,
3
- "best_metric": 0.5829094648361206,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-5500",
5
- "epoch": 3.997636793310307,
6
  "eval_steps": 250,
7
- "global_step": 5500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2217,6 +2217,206 @@
2217
  "eval_samples_per_second": 43.492,
2218
  "eval_steps_per_second": 5.443,
2219
  "step": 5500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2220
  }
2221
  ],
2222
  "logging_steps": 25,
@@ -2236,7 +2436,7 @@
2236
  "attributes": {}
2237
  }
2238
  },
2239
- "total_flos": 3.0548540768282214e+17,
2240
  "train_batch_size": 4,
2241
  "trial_name": null,
2242
  "trial_params": null
 
1
  {
2
+ "best_global_step": 6000,
3
+ "best_metric": 0.578772783279419,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-6000",
5
+ "epoch": 4.360661697873114,
6
  "eval_steps": 250,
7
+ "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2217
  "eval_samples_per_second": 43.492,
2218
  "eval_steps_per_second": 5.443,
2219
  "step": 5500
2220
+ },
2221
+ {
2222
+ "epoch": 4.0152699509180145,
2223
+ "grad_norm": 0.7754128575325012,
2224
+ "learning_rate": 6.030204764247823e-05,
2225
+ "loss": 0.5672,
2226
+ "mean_token_accuracy": 0.825141193633227,
2227
+ "num_tokens": 121668495.0,
2228
+ "step": 5525
2229
+ },
2230
+ {
2231
+ "epoch": 4.033448463915652,
2232
+ "grad_norm": 0.8203994631767273,
2233
+ "learning_rate": 6.0136883795470986e-05,
2234
+ "loss": 0.5715,
2235
+ "mean_token_accuracy": 0.8235750555992126,
2236
+ "num_tokens": 122202715.0,
2237
+ "step": 5550
2238
+ },
2239
+ {
2240
+ "epoch": 4.051626976913289,
2241
+ "grad_norm": 0.7772579193115234,
2242
+ "learning_rate": 5.997125879212641e-05,
2243
+ "loss": 0.567,
2244
+ "mean_token_accuracy": 0.8232873624563217,
2245
+ "num_tokens": 122751905.0,
2246
+ "step": 5575
2247
+ },
2248
+ {
2249
+ "epoch": 4.069805489910926,
2250
+ "grad_norm": 0.7624217867851257,
2251
+ "learning_rate": 5.9805176425435554e-05,
2252
+ "loss": 0.5715,
2253
+ "mean_token_accuracy": 0.8219173887372017,
2254
+ "num_tokens": 123296741.0,
2255
+ "step": 5600
2256
+ },
2257
+ {
2258
+ "epoch": 4.087984002908562,
2259
+ "grad_norm": 0.8489812612533569,
2260
+ "learning_rate": 5.963864049886357e-05,
2261
+ "loss": 0.5643,
2262
+ "mean_token_accuracy": 0.8252894932031631,
2263
+ "num_tokens": 123839156.0,
2264
+ "step": 5625
2265
+ },
2266
+ {
2267
+ "epoch": 4.106162515906199,
2268
+ "grad_norm": 0.7331461906433105,
2269
+ "learning_rate": 5.947165482626263e-05,
2270
+ "loss": 0.5717,
2271
+ "mean_token_accuracy": 0.8214939990639687,
2272
+ "num_tokens": 124404023.0,
2273
+ "step": 5650
2274
+ },
2275
+ {
2276
+ "epoch": 4.124341028903836,
2277
+ "grad_norm": 0.8406916856765747,
2278
+ "learning_rate": 5.930422323178458e-05,
2279
+ "loss": 0.5707,
2280
+ "mean_token_accuracy": 0.8230401134490967,
2281
+ "num_tokens": 124946377.0,
2282
+ "step": 5675
2283
+ },
2284
+ {
2285
+ "epoch": 4.142519541901472,
2286
+ "grad_norm": 0.8017742037773132,
2287
+ "learning_rate": 5.9136349549793323e-05,
2288
+ "loss": 0.5714,
2289
+ "mean_token_accuracy": 0.8239132612943649,
2290
+ "num_tokens": 125485916.0,
2291
+ "step": 5700
2292
+ },
2293
+ {
2294
+ "epoch": 4.160698054899109,
2295
+ "grad_norm": 0.7596368789672852,
2296
+ "learning_rate": 5.89680376247771e-05,
2297
+ "loss": 0.567,
2298
+ "mean_token_accuracy": 0.8242393881082535,
2299
+ "num_tokens": 126031951.0,
2300
+ "step": 5725
2301
+ },
2302
+ {
2303
+ "epoch": 4.178876567896746,
2304
+ "grad_norm": 0.8254388570785522,
2305
+ "learning_rate": 5.879929131126035e-05,
2306
+ "loss": 0.5663,
2307
+ "mean_token_accuracy": 0.8245700207352639,
2308
+ "num_tokens": 126583391.0,
2309
+ "step": 5750
2310
+ },
2311
+ {
2312
+ "epoch": 4.178876567896746,
2313
+ "eval_loss": 0.5802226662635803,
2314
+ "eval_mean_token_accuracy": 0.8190604671348933,
2315
+ "eval_num_tokens": 126583391.0,
2316
+ "eval_runtime": 113.2756,
2317
+ "eval_samples_per_second": 43.169,
2318
+ "eval_steps_per_second": 5.403,
2319
+ "step": 5750
2320
+ },
2321
+ {
2322
+ "epoch": 4.1970550808943825,
2323
+ "grad_norm": 0.7768835425376892,
2324
+ "learning_rate": 5.8630114473715466e-05,
2325
+ "loss": 0.571,
2326
+ "mean_token_accuracy": 0.822128147482872,
2327
+ "num_tokens": 127141535.0,
2328
+ "step": 5775
2329
+ },
2330
+ {
2331
+ "epoch": 4.21523359389202,
2332
+ "grad_norm": 0.800995945930481,
2333
+ "learning_rate": 5.846051098647433e-05,
2334
+ "loss": 0.566,
2335
+ "mean_token_accuracy": 0.8245965147018433,
2336
+ "num_tokens": 127703604.0,
2337
+ "step": 5800
2338
+ },
2339
+ {
2340
+ "epoch": 4.233412106889657,
2341
+ "grad_norm": 0.8068668246269226,
2342
+ "learning_rate": 5.8290484733639566e-05,
2343
+ "loss": 0.5738,
2344
+ "mean_token_accuracy": 0.8229434779286384,
2345
+ "num_tokens": 128229875.0,
2346
+ "step": 5825
2347
+ },
2348
+ {
2349
+ "epoch": 4.251590619887293,
2350
+ "grad_norm": 0.7782077789306641,
2351
+ "learning_rate": 5.812003960899557e-05,
2352
+ "loss": 0.5698,
2353
+ "mean_token_accuracy": 0.8222619444131851,
2354
+ "num_tokens": 128785210.0,
2355
+ "step": 5850
2356
+ },
2357
+ {
2358
+ "epoch": 4.26976913288493,
2359
+ "grad_norm": 0.7673845887184143,
2360
+ "learning_rate": 5.7949179515919366e-05,
2361
+ "loss": 0.5725,
2362
+ "mean_token_accuracy": 0.8225021129846573,
2363
+ "num_tokens": 129340256.0,
2364
+ "step": 5875
2365
+ },
2366
+ {
2367
+ "epoch": 4.287947645882567,
2368
+ "grad_norm": 0.7331883311271667,
2369
+ "learning_rate": 5.777790836729117e-05,
2370
+ "loss": 0.5642,
2371
+ "mean_token_accuracy": 0.8251486110687256,
2372
+ "num_tokens": 129879617.0,
2373
+ "step": 5900
2374
+ },
2375
+ {
2376
+ "epoch": 4.306126158880204,
2377
+ "grad_norm": 0.8080233931541443,
2378
+ "learning_rate": 5.760623008540487e-05,
2379
+ "loss": 0.5637,
2380
+ "mean_token_accuracy": 0.8252076309919357,
2381
+ "num_tokens": 130438521.0,
2382
+ "step": 5925
2383
+ },
2384
+ {
2385
+ "epoch": 4.32430467187784,
2386
+ "grad_norm": 0.8210439085960388,
2387
+ "learning_rate": 5.743414860187809e-05,
2388
+ "loss": 0.5667,
2389
+ "mean_token_accuracy": 0.8240964418649673,
2390
+ "num_tokens": 130988259.0,
2391
+ "step": 5950
2392
+ },
2393
+ {
2394
+ "epoch": 4.342483184875477,
2395
+ "grad_norm": 0.7626243233680725,
2396
+ "learning_rate": 5.726166785756224e-05,
2397
+ "loss": 0.5722,
2398
+ "mean_token_accuracy": 0.822701002061367,
2399
+ "num_tokens": 131539988.0,
2400
+ "step": 5975
2401
+ },
2402
+ {
2403
+ "epoch": 4.360661697873114,
2404
+ "grad_norm": 0.8430207967758179,
2405
+ "learning_rate": 5.708879180245222e-05,
2406
+ "loss": 0.5656,
2407
+ "mean_token_accuracy": 0.8251918998360633,
2408
+ "num_tokens": 132079465.0,
2409
+ "step": 6000
2410
+ },
2411
+ {
2412
+ "epoch": 4.360661697873114,
2413
+ "eval_loss": 0.578772783279419,
2414
+ "eval_mean_token_accuracy": 0.8196869551940681,
2415
+ "eval_num_tokens": 132079465.0,
2416
+ "eval_runtime": 112.7171,
2417
+ "eval_samples_per_second": 43.383,
2418
+ "eval_steps_per_second": 5.43,
2419
+ "step": 6000
2420
  }
2421
  ],
2422
  "logging_steps": 25,
 
2436
  "attributes": {}
2437
  }
2438
  },
2439
+ "total_flos": 3.3318287854639514e+17,
2440
  "train_batch_size": 4,
2441
  "trial_name": null,
2442
  "trial_params": null