NBAmine commited on
Commit
9d3beb4
·
verified ·
1 Parent(s): 6d1015d

Training in progress, step 2400, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9e46ae2720088669da0e7f9e660e9df21b3f13cd814ef2c054173a76a40c0a8
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4ddfdb9e3869897cc8e2c794340a2005ba76c5f50e34e53325b8ac99f6dc318
3
  size 228140600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76b3121a237388d42068dd86668509dc36abd8695d8ccbfd6fb7b924e1a73d7f
3
  size 117931203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e830b2069082bb840c6d5f287e7770c0cf0e2e3f80198ae73bbef00b14811db
3
  size 117931203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6df16b3659f33d85607b74fb7cdd42ccb03ca1d0dc5313a9352883e092924860
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e39d866cd1fc861fe2c47687364cde08217b0454e6f5ff3c9a3af4b1571fdbed
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ed5fdd6f9fe5f0de5d43635eeeee3253ccf660833d7fe6d9be640b40bec6bbe
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:137d8a11890df77c4e1b6a4687bee089955dbcdddb421d49b265e762ccebb1d2
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc835731ce73222513c24c9953cdc95225ff0e18509f3befa431f270d3d03450
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a3c006c2c7c0bc33914c8e11069f53d495f2eafa42ba0a076cb7cebbe066c7a
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 750,
3
  "best_metric": 0.5089643597602844,
4
  "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
5
- "epoch": 3.36,
6
  "eval_steps": 300,
7
- "global_step": 2100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2252,6 +2252,318 @@
2252
  "eval_samples_per_second": 2.3,
2253
  "eval_steps_per_second": 0.575,
2254
  "step": 2100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2255
  }
2256
  ],
2257
  "logging_steps": 10,
@@ -2271,7 +2583,7 @@
2271
  "attributes": {}
2272
  }
2273
  },
2274
- "total_flos": 3.6234506980141056e+17,
2275
  "train_batch_size": 1,
2276
  "trial_name": null,
2277
  "trial_params": null
 
2
  "best_global_step": 750,
3
  "best_metric": 0.5089643597602844,
4
  "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
5
+ "epoch": 3.84,
6
  "eval_steps": 300,
7
+ "global_step": 2400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2252
  "eval_samples_per_second": 2.3,
2253
  "eval_steps_per_second": 0.575,
2254
  "step": 2100
2255
+ },
2256
+ {
2257
+ "entropy": 0.3517730229534209,
2258
+ "epoch": 3.376,
2259
+ "grad_norm": 0.6908054947853088,
2260
+ "learning_rate": 3.2544000000000006e-05,
2261
+ "loss": 0.3057,
2262
+ "mean_token_accuracy": 0.9103573642671108,
2263
+ "num_tokens": 266432.0,
2264
+ "step": 2110
2265
+ },
2266
+ {
2267
+ "entropy": 0.38618900515139104,
2268
+ "epoch": 3.392,
2269
+ "grad_norm": 0.9056383967399597,
2270
+ "learning_rate": 3.2224e-05,
2271
+ "loss": 0.3188,
2272
+ "mean_token_accuracy": 0.9076898027211427,
2273
+ "num_tokens": 282655.0,
2274
+ "step": 2120
2275
+ },
2276
+ {
2277
+ "entropy": 0.3537537831813097,
2278
+ "epoch": 3.408,
2279
+ "grad_norm": 0.48644715547561646,
2280
+ "learning_rate": 3.1904e-05,
2281
+ "loss": 0.2886,
2282
+ "mean_token_accuracy": 0.9162093725055456,
2283
+ "num_tokens": 310801.0,
2284
+ "step": 2130
2285
+ },
2286
+ {
2287
+ "entropy": 0.26729877749457953,
2288
+ "epoch": 3.424,
2289
+ "grad_norm": 0.6074755787849426,
2290
+ "learning_rate": 3.1584e-05,
2291
+ "loss": 0.2371,
2292
+ "mean_token_accuracy": 0.9263024788349867,
2293
+ "num_tokens": 343555.0,
2294
+ "step": 2140
2295
+ },
2296
+ {
2297
+ "entropy": 0.25955253606662154,
2298
+ "epoch": 3.44,
2299
+ "grad_norm": 0.8773949146270752,
2300
+ "learning_rate": 3.1264e-05,
2301
+ "loss": 0.2227,
2302
+ "mean_token_accuracy": 0.9337353933602571,
2303
+ "num_tokens": 369134.0,
2304
+ "step": 2150
2305
+ },
2306
+ {
2307
+ "entropy": 0.27338800597935914,
2308
+ "epoch": 3.456,
2309
+ "grad_norm": 0.7504522204399109,
2310
+ "learning_rate": 3.0975999999999996e-05,
2311
+ "loss": 0.2261,
2312
+ "mean_token_accuracy": 0.9332862004637719,
2313
+ "num_tokens": 390152.0,
2314
+ "step": 2160
2315
+ },
2316
+ {
2317
+ "entropy": 0.30181694105267526,
2318
+ "epoch": 3.472,
2319
+ "grad_norm": 0.8649200201034546,
2320
+ "learning_rate": 3.0656e-05,
2321
+ "loss": 0.2289,
2322
+ "mean_token_accuracy": 0.9334215141832829,
2323
+ "num_tokens": 406222.0,
2324
+ "step": 2170
2325
+ },
2326
+ {
2327
+ "entropy": 0.28406244921498003,
2328
+ "epoch": 3.488,
2329
+ "grad_norm": 1.9269925355911255,
2330
+ "learning_rate": 3.0336000000000002e-05,
2331
+ "loss": 0.2353,
2332
+ "mean_token_accuracy": 0.9303826864808797,
2333
+ "num_tokens": 434767.0,
2334
+ "step": 2180
2335
+ },
2336
+ {
2337
+ "entropy": 0.2358154426328838,
2338
+ "epoch": 3.504,
2339
+ "grad_norm": 0.7775760293006897,
2340
+ "learning_rate": 3.0016e-05,
2341
+ "loss": 0.2277,
2342
+ "mean_token_accuracy": 0.9293628957122564,
2343
+ "num_tokens": 467498.0,
2344
+ "step": 2190
2345
+ },
2346
+ {
2347
+ "entropy": 0.2596265008673072,
2348
+ "epoch": 3.52,
2349
+ "grad_norm": 0.7286163568496704,
2350
+ "learning_rate": 2.9696e-05,
2351
+ "loss": 0.2266,
2352
+ "mean_token_accuracy": 0.9321592267602682,
2353
+ "num_tokens": 493146.0,
2354
+ "step": 2200
2355
+ },
2356
+ {
2357
+ "entropy": 0.28550293026492,
2358
+ "epoch": 3.536,
2359
+ "grad_norm": 0.7693914175033569,
2360
+ "learning_rate": 2.9376000000000005e-05,
2361
+ "loss": 0.2291,
2362
+ "mean_token_accuracy": 0.9351058643311262,
2363
+ "num_tokens": 513926.0,
2364
+ "step": 2210
2365
+ },
2366
+ {
2367
+ "entropy": 0.2885140863247216,
2368
+ "epoch": 3.552,
2369
+ "grad_norm": 1.1927505731582642,
2370
+ "learning_rate": 2.9056e-05,
2371
+ "loss": 0.219,
2372
+ "mean_token_accuracy": 0.9396381825208664,
2373
+ "num_tokens": 530263.0,
2374
+ "step": 2220
2375
+ },
2376
+ {
2377
+ "entropy": 0.283741835039109,
2378
+ "epoch": 3.568,
2379
+ "grad_norm": 0.6537899971008301,
2380
+ "learning_rate": 2.8736e-05,
2381
+ "loss": 0.2324,
2382
+ "mean_token_accuracy": 0.9302929677069187,
2383
+ "num_tokens": 559791.0,
2384
+ "step": 2230
2385
+ },
2386
+ {
2387
+ "entropy": 0.2369093818590045,
2388
+ "epoch": 3.584,
2389
+ "grad_norm": 0.793480396270752,
2390
+ "learning_rate": 2.8416000000000004e-05,
2391
+ "loss": 0.2165,
2392
+ "mean_token_accuracy": 0.9320364937186241,
2393
+ "num_tokens": 592398.0,
2394
+ "step": 2240
2395
+ },
2396
+ {
2397
+ "entropy": 0.264733817987144,
2398
+ "epoch": 3.6,
2399
+ "grad_norm": 0.7945203185081482,
2400
+ "learning_rate": 2.8096e-05,
2401
+ "loss": 0.2337,
2402
+ "mean_token_accuracy": 0.9294226188212633,
2403
+ "num_tokens": 617982.0,
2404
+ "step": 2250
2405
+ },
2406
+ {
2407
+ "entropy": 0.2889886857941747,
2408
+ "epoch": 3.616,
2409
+ "grad_norm": 0.7558261752128601,
2410
+ "learning_rate": 2.7776000000000003e-05,
2411
+ "loss": 0.2305,
2412
+ "mean_token_accuracy": 0.9317790925502777,
2413
+ "num_tokens": 639115.0,
2414
+ "step": 2260
2415
+ },
2416
+ {
2417
+ "entropy": 0.28708559228107333,
2418
+ "epoch": 3.632,
2419
+ "grad_norm": 0.6877163648605347,
2420
+ "learning_rate": 2.7456000000000003e-05,
2421
+ "loss": 0.2215,
2422
+ "mean_token_accuracy": 0.9357377961277962,
2423
+ "num_tokens": 655709.0,
2424
+ "step": 2270
2425
+ },
2426
+ {
2427
+ "entropy": 0.28660596534609795,
2428
+ "epoch": 3.648,
2429
+ "grad_norm": 0.6599491238594055,
2430
+ "learning_rate": 2.7136e-05,
2431
+ "loss": 0.2363,
2432
+ "mean_token_accuracy": 0.928611570596695,
2433
+ "num_tokens": 684500.0,
2434
+ "step": 2280
2435
+ },
2436
+ {
2437
+ "entropy": 0.23836621949449183,
2438
+ "epoch": 3.664,
2439
+ "grad_norm": 0.7436323165893555,
2440
+ "learning_rate": 2.6816000000000002e-05,
2441
+ "loss": 0.2194,
2442
+ "mean_token_accuracy": 0.9314162913709879,
2443
+ "num_tokens": 717271.0,
2444
+ "step": 2290
2445
+ },
2446
+ {
2447
+ "entropy": 0.27099227644503115,
2448
+ "epoch": 3.68,
2449
+ "grad_norm": 0.7519745826721191,
2450
+ "learning_rate": 2.6496e-05,
2451
+ "loss": 0.2369,
2452
+ "mean_token_accuracy": 0.9278060872107744,
2453
+ "num_tokens": 743068.0,
2454
+ "step": 2300
2455
+ },
2456
+ {
2457
+ "entropy": 0.282380092702806,
2458
+ "epoch": 3.6959999999999997,
2459
+ "grad_norm": 0.7645207643508911,
2460
+ "learning_rate": 2.6176e-05,
2461
+ "loss": 0.2175,
2462
+ "mean_token_accuracy": 0.9372334524989128,
2463
+ "num_tokens": 763925.0,
2464
+ "step": 2310
2465
+ },
2466
+ {
2467
+ "entropy": 0.2850790939293802,
2468
+ "epoch": 3.7119999999999997,
2469
+ "grad_norm": 0.9016556143760681,
2470
+ "learning_rate": 2.5856e-05,
2471
+ "loss": 0.217,
2472
+ "mean_token_accuracy": 0.9392455574125051,
2473
+ "num_tokens": 780111.0,
2474
+ "step": 2320
2475
+ },
2476
+ {
2477
+ "entropy": 0.2691464308649302,
2478
+ "epoch": 3.7279999999999998,
2479
+ "grad_norm": 0.77091383934021,
2480
+ "learning_rate": 2.5535999999999997e-05,
2481
+ "loss": 0.2334,
2482
+ "mean_token_accuracy": 0.929338139295578,
2483
+ "num_tokens": 808661.0,
2484
+ "step": 2330
2485
+ },
2486
+ {
2487
+ "entropy": 0.2395469973795116,
2488
+ "epoch": 3.7439999999999998,
2489
+ "grad_norm": 0.7632396221160889,
2490
+ "learning_rate": 2.5216e-05,
2491
+ "loss": 0.2148,
2492
+ "mean_token_accuracy": 0.9322273649275303,
2493
+ "num_tokens": 840932.0,
2494
+ "step": 2340
2495
+ },
2496
+ {
2497
+ "entropy": 0.2645680231973529,
2498
+ "epoch": 3.76,
2499
+ "grad_norm": 0.819273054599762,
2500
+ "learning_rate": 2.4896e-05,
2501
+ "loss": 0.226,
2502
+ "mean_token_accuracy": 0.930556321516633,
2503
+ "num_tokens": 866564.0,
2504
+ "step": 2350
2505
+ },
2506
+ {
2507
+ "entropy": 0.2808503101579845,
2508
+ "epoch": 3.776,
2509
+ "grad_norm": 0.8598120808601379,
2510
+ "learning_rate": 2.4576000000000003e-05,
2511
+ "loss": 0.2215,
2512
+ "mean_token_accuracy": 0.9356644533574581,
2513
+ "num_tokens": 887527.0,
2514
+ "step": 2360
2515
+ },
2516
+ {
2517
+ "entropy": 0.28694011168554423,
2518
+ "epoch": 3.792,
2519
+ "grad_norm": 1.0404748916625977,
2520
+ "learning_rate": 2.4256e-05,
2521
+ "loss": 0.214,
2522
+ "mean_token_accuracy": 0.9388030290603637,
2523
+ "num_tokens": 903688.0,
2524
+ "step": 2370
2525
+ },
2526
+ {
2527
+ "entropy": 0.2774578414391726,
2528
+ "epoch": 3.808,
2529
+ "grad_norm": 1.2308194637298584,
2530
+ "learning_rate": 2.3936e-05,
2531
+ "loss": 0.2328,
2532
+ "mean_token_accuracy": 0.929581755027175,
2533
+ "num_tokens": 932975.0,
2534
+ "step": 2380
2535
+ },
2536
+ {
2537
+ "entropy": 0.2381771973334253,
2538
+ "epoch": 3.824,
2539
+ "grad_norm": 0.7983541488647461,
2540
+ "learning_rate": 2.3616000000000002e-05,
2541
+ "loss": 0.2177,
2542
+ "mean_token_accuracy": 0.9316004611551761,
2543
+ "num_tokens": 965221.0,
2544
+ "step": 2390
2545
+ },
2546
+ {
2547
+ "entropy": 0.2579630766995251,
2548
+ "epoch": 3.84,
2549
+ "grad_norm": 0.8867554068565369,
2550
+ "learning_rate": 2.3296000000000002e-05,
2551
+ "loss": 0.2221,
2552
+ "mean_token_accuracy": 0.9320516049861908,
2553
+ "num_tokens": 990859.0,
2554
+ "step": 2400
2555
+ },
2556
+ {
2557
+ "epoch": 3.84,
2558
+ "eval_accuracy": 0.02676376698545462,
2559
+ "eval_entropy": 0.3534155045747757,
2560
+ "eval_loss": 0.6058897972106934,
2561
+ "eval_mean_token_accuracy": 0.8553497910499572,
2562
+ "eval_num_tokens": 990859.0,
2563
+ "eval_runtime": 869.2088,
2564
+ "eval_samples_per_second": 2.301,
2565
+ "eval_steps_per_second": 0.575,
2566
+ "step": 2400
2567
  }
2568
  ],
2569
  "logging_steps": 10,
 
2583
  "attributes": {}
2584
  }
2585
  },
2586
+ "total_flos": 4.143800723056128e+17,
2587
  "train_batch_size": 1,
2588
  "trial_name": null,
2589
  "trial_params": null