mtzig commited on
Commit
80fd6e0
·
verified ·
1 Parent(s): 137f25d

Training in progress, step 400, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e380ae04a5173aeaed71f5a23293af6a6b5ce9b37a1646c0f6027f825d779fc
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cd061667d62cdf6b8e885ae672a4fa1817a64172c0dc13f261537a3e31f28db
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b043b19da04e16af34cc8400633335c68ab02712105f1221be29791b7d4e409
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87e886c2d84e224bcf754622b2803a8b4b64da30bce7eb4e4a3fb75b1b091c5e
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f7f2a92d7df0f41408c607126dde2ec742d9311ee46369d1b8e81e62ba64c29
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d01731a1af4d71978b316124603e4caa090cc86ccd121d20f40ef90314e39721
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ddb9986bf6ad380f520fbe804799f709d80c796ef3ff88134b3291fe5611761
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6befdc931c99a6a9572bf364f4fbf3a16a16ac047bda664b290f7eaf2d6f0509
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a5ac5a37a7f3a37a7f5328e215de1663f8e85b03df885c4f3a38576bbb58b65
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d08e96af21e8b93e1cae1c1f298c74bd5cb903e59a95e666fe5d23d7c34e828
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9c10492278bc53059b6ed6f765490ebda8641ddb2ca6422c5a3ff08f7b12216
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:800dcee3d49bf7c4fb9af44a7247d8c8a98f39fbe21de15901e57a24fee6d511
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3aac0a2ddbb2e0439e67de80ba07301bbb4f6fae538d608784bb99a990eb4374
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95e1a7c487043377d57b4e529a8c41b121f1a82a2bf5513187f81cd357b2a6fd
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70a08e5416ead83a8ca5a4737f339d26abe014328af01895f5dc9b9056c94042
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cc70fc94ad214460b8f53afbc67815e264058229327612b212b333c955747d2
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f53af7b36bcb1f4a3830f3094a6baae96f54d6751f1aca6ab0241469f55b4c77
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ffe6f9ada3514f92495fec3edd9e5bfa7e16527e9f4d407a243ffca3a335369
3
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b93e300fca3864a3b00ebf8f20bb271d3ee6a1118129c64855b165724ec8737a
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea26ba6138daf1586403be19f69bfdf220f2970f3306409052e7562dbee71e8f
3
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3aeed4b8b1f8111068fdf649eef309274cafb5724b7079e7c1ab8b7d24799ae1
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:350fe26b744c676e14aee27774ebc4bcf6a2961db0854ea02d257bc061e2b80c
3
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bebf1ac8b6d22b64f12ee5515472ef4631edd26eaeae162231d65e567dd578f
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8aa0e29a92b309693c410bb08006a182de233c5acd31c16b7450cbb9c31feeb
3
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba5d98c98ae03b619b5cc816786d7328ffd6502c6e3927d2220789c3367ca675
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f4d7d51569641046d070ffaf530561887033fff68178c32329f5f5841b1a076
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.37656903765690375,
5
  "eval_steps": 20,
6
- "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2299,6 +2299,766 @@
2299
  "eval_samples_per_second": 5.258,
2300
  "eval_steps_per_second": 0.171,
2301
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2302
  }
2303
  ],
2304
  "logging_steps": 1,
@@ -2318,7 +3078,7 @@
2318
  "attributes": {}
2319
  }
2320
  },
2321
- "total_flos": 1.0951134131571917e+17,
2322
  "train_batch_size": 6,
2323
  "trial_name": null,
2324
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.502092050209205,
5
  "eval_steps": 20,
6
+ "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2299
  "eval_samples_per_second": 5.258,
2300
  "eval_steps_per_second": 0.171,
2301
  "step": 300
2302
+ },
2303
+ {
2304
+ "epoch": 0.3778242677824268,
2305
+ "grad_norm": 7.539090156555176,
2306
+ "learning_rate": 1.565562202732211e-05,
2307
+ "loss": 0.289,
2308
+ "step": 301
2309
+ },
2310
+ {
2311
+ "epoch": 0.3790794979079498,
2312
+ "grad_norm": 7.3726420402526855,
2313
+ "learning_rate": 1.561938209963753e-05,
2314
+ "loss": 0.2752,
2315
+ "step": 302
2316
+ },
2317
+ {
2318
+ "epoch": 0.3803347280334728,
2319
+ "grad_norm": 5.038547515869141,
2320
+ "learning_rate": 1.5583033988318453e-05,
2321
+ "loss": 0.2419,
2322
+ "step": 303
2323
+ },
2324
+ {
2325
+ "epoch": 0.3815899581589958,
2326
+ "grad_norm": 3.0914595127105713,
2327
+ "learning_rate": 1.554657839313413e-05,
2328
+ "loss": 0.2324,
2329
+ "step": 304
2330
+ },
2331
+ {
2332
+ "epoch": 0.38284518828451886,
2333
+ "grad_norm": 5.068948268890381,
2334
+ "learning_rate": 1.5510016015923084e-05,
2335
+ "loss": 0.2864,
2336
+ "step": 305
2337
+ },
2338
+ {
2339
+ "epoch": 0.38410041841004183,
2340
+ "grad_norm": 4.331803321838379,
2341
+ "learning_rate": 1.5473347560579576e-05,
2342
+ "loss": 0.2247,
2343
+ "step": 306
2344
+ },
2345
+ {
2346
+ "epoch": 0.38535564853556487,
2347
+ "grad_norm": 4.25094747543335,
2348
+ "learning_rate": 1.5436573733040073e-05,
2349
+ "loss": 0.2025,
2350
+ "step": 307
2351
+ },
2352
+ {
2353
+ "epoch": 0.38661087866108784,
2354
+ "grad_norm": 6.317193984985352,
2355
+ "learning_rate": 1.539969524126967e-05,
2356
+ "loss": 0.2389,
2357
+ "step": 308
2358
+ },
2359
+ {
2360
+ "epoch": 0.3878661087866109,
2361
+ "grad_norm": 5.176138401031494,
2362
+ "learning_rate": 1.5362712795248423e-05,
2363
+ "loss": 0.2235,
2364
+ "step": 309
2365
+ },
2366
+ {
2367
+ "epoch": 0.3891213389121339,
2368
+ "grad_norm": 4.67032527923584,
2369
+ "learning_rate": 1.5325627106957715e-05,
2370
+ "loss": 0.2004,
2371
+ "step": 310
2372
+ },
2373
+ {
2374
+ "epoch": 0.3903765690376569,
2375
+ "grad_norm": 7.408180236816406,
2376
+ "learning_rate": 1.5288438890366534e-05,
2377
+ "loss": 0.3133,
2378
+ "step": 311
2379
+ },
2380
+ {
2381
+ "epoch": 0.3916317991631799,
2382
+ "grad_norm": 4.369890213012695,
2383
+ "learning_rate": 1.5251148861417733e-05,
2384
+ "loss": 0.2798,
2385
+ "step": 312
2386
+ },
2387
+ {
2388
+ "epoch": 0.39288702928870295,
2389
+ "grad_norm": 6.916268348693848,
2390
+ "learning_rate": 1.5213757738014234e-05,
2391
+ "loss": 0.2518,
2392
+ "step": 313
2393
+ },
2394
+ {
2395
+ "epoch": 0.3941422594142259,
2396
+ "grad_norm": 3.2595841884613037,
2397
+ "learning_rate": 1.5176266240005225e-05,
2398
+ "loss": 0.2666,
2399
+ "step": 314
2400
+ },
2401
+ {
2402
+ "epoch": 0.39539748953974896,
2403
+ "grad_norm": 4.970115661621094,
2404
+ "learning_rate": 1.513867508917229e-05,
2405
+ "loss": 0.2762,
2406
+ "step": 315
2407
+ },
2408
+ {
2409
+ "epoch": 0.396652719665272,
2410
+ "grad_norm": 3.959069013595581,
2411
+ "learning_rate": 1.5100985009215519e-05,
2412
+ "loss": 0.2324,
2413
+ "step": 316
2414
+ },
2415
+ {
2416
+ "epoch": 0.39790794979079497,
2417
+ "grad_norm": 5.496798515319824,
2418
+ "learning_rate": 1.5063196725739568e-05,
2419
+ "loss": 0.283,
2420
+ "step": 317
2421
+ },
2422
+ {
2423
+ "epoch": 0.399163179916318,
2424
+ "grad_norm": 4.346258640289307,
2425
+ "learning_rate": 1.5025310966239701e-05,
2426
+ "loss": 0.2182,
2427
+ "step": 318
2428
+ },
2429
+ {
2430
+ "epoch": 0.400418410041841,
2431
+ "grad_norm": 7.267153263092041,
2432
+ "learning_rate": 1.4987328460087778e-05,
2433
+ "loss": 0.2261,
2434
+ "step": 319
2435
+ },
2436
+ {
2437
+ "epoch": 0.401673640167364,
2438
+ "grad_norm": 4.095457077026367,
2439
+ "learning_rate": 1.4949249938518203e-05,
2440
+ "loss": 0.2597,
2441
+ "step": 320
2442
+ },
2443
+ {
2444
+ "epoch": 0.401673640167364,
2445
+ "eval_accuracy": 0.8509933774834437,
2446
+ "eval_f1": 0.6867749419953596,
2447
+ "eval_loss": 0.31619083881378174,
2448
+ "eval_precision": 0.8361581920903954,
2449
+ "eval_recall": 0.5826771653543307,
2450
+ "eval_runtime": 50.8111,
2451
+ "eval_samples_per_second": 5.452,
2452
+ "eval_steps_per_second": 0.177,
2453
+ "step": 320
2454
+ },
2455
+ {
2456
+ "epoch": 0.40292887029288704,
2457
+ "grad_norm": 4.564698696136475,
2458
+ "learning_rate": 1.491107613461387e-05,
2459
+ "loss": 0.2494,
2460
+ "step": 321
2461
+ },
2462
+ {
2463
+ "epoch": 0.40418410041841,
2464
+ "grad_norm": 3.54681134223938,
2465
+ "learning_rate": 1.4872807783292027e-05,
2466
+ "loss": 0.2396,
2467
+ "step": 322
2468
+ },
2469
+ {
2470
+ "epoch": 0.40543933054393305,
2471
+ "grad_norm": 3.487334966659546,
2472
+ "learning_rate": 1.4834445621290144e-05,
2473
+ "loss": 0.2264,
2474
+ "step": 323
2475
+ },
2476
+ {
2477
+ "epoch": 0.4066945606694561,
2478
+ "grad_norm": 4.941503047943115,
2479
+ "learning_rate": 1.4795990387151719e-05,
2480
+ "loss": 0.2566,
2481
+ "step": 324
2482
+ },
2483
+ {
2484
+ "epoch": 0.40794979079497906,
2485
+ "grad_norm": 3.7651941776275635,
2486
+ "learning_rate": 1.4757442821212058e-05,
2487
+ "loss": 0.2159,
2488
+ "step": 325
2489
+ },
2490
+ {
2491
+ "epoch": 0.4092050209205021,
2492
+ "grad_norm": 6.6421685218811035,
2493
+ "learning_rate": 1.4718803665584038e-05,
2494
+ "loss": 0.2367,
2495
+ "step": 326
2496
+ },
2497
+ {
2498
+ "epoch": 0.4104602510460251,
2499
+ "grad_norm": 4.226874351501465,
2500
+ "learning_rate": 1.4680073664143799e-05,
2501
+ "loss": 0.2573,
2502
+ "step": 327
2503
+ },
2504
+ {
2505
+ "epoch": 0.4117154811715481,
2506
+ "grad_norm": 5.6968536376953125,
2507
+ "learning_rate": 1.464125356251644e-05,
2508
+ "loss": 0.3498,
2509
+ "step": 328
2510
+ },
2511
+ {
2512
+ "epoch": 0.41297071129707114,
2513
+ "grad_norm": 5.091569900512695,
2514
+ "learning_rate": 1.4602344108061657e-05,
2515
+ "loss": 0.2999,
2516
+ "step": 329
2517
+ },
2518
+ {
2519
+ "epoch": 0.41422594142259417,
2520
+ "grad_norm": 3.757646083831787,
2521
+ "learning_rate": 1.4563346049859348e-05,
2522
+ "loss": 0.2588,
2523
+ "step": 330
2524
+ },
2525
+ {
2526
+ "epoch": 0.41548117154811715,
2527
+ "grad_norm": 3.553725242614746,
2528
+ "learning_rate": 1.4524260138695206e-05,
2529
+ "loss": 0.3026,
2530
+ "step": 331
2531
+ },
2532
+ {
2533
+ "epoch": 0.4167364016736402,
2534
+ "grad_norm": 4.0715765953063965,
2535
+ "learning_rate": 1.4485087127046256e-05,
2536
+ "loss": 0.3188,
2537
+ "step": 332
2538
+ },
2539
+ {
2540
+ "epoch": 0.41799163179916315,
2541
+ "grad_norm": 3.9009945392608643,
2542
+ "learning_rate": 1.4445827769066374e-05,
2543
+ "loss": 0.2373,
2544
+ "step": 333
2545
+ },
2546
+ {
2547
+ "epoch": 0.4192468619246862,
2548
+ "grad_norm": 3.4119412899017334,
2549
+ "learning_rate": 1.4406482820571759e-05,
2550
+ "loss": 0.2381,
2551
+ "step": 334
2552
+ },
2553
+ {
2554
+ "epoch": 0.4205020920502092,
2555
+ "grad_norm": 7.349539756774902,
2556
+ "learning_rate": 1.4367053039026392e-05,
2557
+ "loss": 0.2528,
2558
+ "step": 335
2559
+ },
2560
+ {
2561
+ "epoch": 0.4217573221757322,
2562
+ "grad_norm": 3.9228568077087402,
2563
+ "learning_rate": 1.4327539183527447e-05,
2564
+ "loss": 0.249,
2565
+ "step": 336
2566
+ },
2567
+ {
2568
+ "epoch": 0.42301255230125523,
2569
+ "grad_norm": 5.134557247161865,
2570
+ "learning_rate": 1.4287942014790677e-05,
2571
+ "loss": 0.2908,
2572
+ "step": 337
2573
+ },
2574
+ {
2575
+ "epoch": 0.42426778242677826,
2576
+ "grad_norm": 4.12131929397583,
2577
+ "learning_rate": 1.4248262295135779e-05,
2578
+ "loss": 0.2661,
2579
+ "step": 338
2580
+ },
2581
+ {
2582
+ "epoch": 0.42552301255230124,
2583
+ "grad_norm": 3.757857322692871,
2584
+ "learning_rate": 1.42085007884717e-05,
2585
+ "loss": 0.2448,
2586
+ "step": 339
2587
+ },
2588
+ {
2589
+ "epoch": 0.42677824267782427,
2590
+ "grad_norm": 3.9377548694610596,
2591
+ "learning_rate": 1.4168658260281944e-05,
2592
+ "loss": 0.287,
2593
+ "step": 340
2594
+ },
2595
+ {
2596
+ "epoch": 0.42677824267782427,
2597
+ "eval_accuracy": 0.8532008830022075,
2598
+ "eval_f1": 0.70509977827051,
2599
+ "eval_loss": 0.29967617988586426,
2600
+ "eval_precision": 0.8071065989847716,
2601
+ "eval_recall": 0.6259842519685039,
2602
+ "eval_runtime": 50.9044,
2603
+ "eval_samples_per_second": 5.442,
2604
+ "eval_steps_per_second": 0.177,
2605
+ "step": 340
2606
+ },
2607
+ {
2608
+ "epoch": 0.4280334728033473,
2609
+ "grad_norm": 3.8355214595794678,
2610
+ "learning_rate": 1.4128735477609839e-05,
2611
+ "loss": 0.2409,
2612
+ "step": 341
2613
+ },
2614
+ {
2615
+ "epoch": 0.4292887029288703,
2616
+ "grad_norm": 3.9827072620391846,
2617
+ "learning_rate": 1.4088733209043748e-05,
2618
+ "loss": 0.1978,
2619
+ "step": 342
2620
+ },
2621
+ {
2622
+ "epoch": 0.4305439330543933,
2623
+ "grad_norm": 3.053262710571289,
2624
+ "learning_rate": 1.4048652224702295e-05,
2625
+ "loss": 0.215,
2626
+ "step": 343
2627
+ },
2628
+ {
2629
+ "epoch": 0.43179916317991635,
2630
+ "grad_norm": 3.117565155029297,
2631
+ "learning_rate": 1.400849329621953e-05,
2632
+ "loss": 0.2652,
2633
+ "step": 344
2634
+ },
2635
+ {
2636
+ "epoch": 0.4330543933054393,
2637
+ "grad_norm": 4.665426731109619,
2638
+ "learning_rate": 1.3968257196730069e-05,
2639
+ "loss": 0.3002,
2640
+ "step": 345
2641
+ },
2642
+ {
2643
+ "epoch": 0.43430962343096235,
2644
+ "grad_norm": 3.6823060512542725,
2645
+ "learning_rate": 1.3927944700854223e-05,
2646
+ "loss": 0.2987,
2647
+ "step": 346
2648
+ },
2649
+ {
2650
+ "epoch": 0.43556485355648533,
2651
+ "grad_norm": 3.018756628036499,
2652
+ "learning_rate": 1.388755658468307e-05,
2653
+ "loss": 0.2399,
2654
+ "step": 347
2655
+ },
2656
+ {
2657
+ "epoch": 0.43682008368200836,
2658
+ "grad_norm": 2.53790283203125,
2659
+ "learning_rate": 1.3847093625763517e-05,
2660
+ "loss": 0.2733,
2661
+ "step": 348
2662
+ },
2663
+ {
2664
+ "epoch": 0.4380753138075314,
2665
+ "grad_norm": 4.417150974273682,
2666
+ "learning_rate": 1.3806556603083346e-05,
2667
+ "loss": 0.2144,
2668
+ "step": 349
2669
+ },
2670
+ {
2671
+ "epoch": 0.4393305439330544,
2672
+ "grad_norm": 6.118602275848389,
2673
+ "learning_rate": 1.3765946297056192e-05,
2674
+ "loss": 0.3063,
2675
+ "step": 350
2676
+ },
2677
+ {
2678
+ "epoch": 0.4405857740585774,
2679
+ "grad_norm": 3.5751051902770996,
2680
+ "learning_rate": 1.3725263489506542e-05,
2681
+ "loss": 0.1951,
2682
+ "step": 351
2683
+ },
2684
+ {
2685
+ "epoch": 0.44184100418410044,
2686
+ "grad_norm": 5.6558837890625,
2687
+ "learning_rate": 1.3684508963654667e-05,
2688
+ "loss": 0.3366,
2689
+ "step": 352
2690
+ },
2691
+ {
2692
+ "epoch": 0.4430962343096234,
2693
+ "grad_norm": 3.0790345668792725,
2694
+ "learning_rate": 1.364368350410155e-05,
2695
+ "loss": 0.2517,
2696
+ "step": 353
2697
+ },
2698
+ {
2699
+ "epoch": 0.44435146443514645,
2700
+ "grad_norm": 3.3675646781921387,
2701
+ "learning_rate": 1.3602787896813787e-05,
2702
+ "loss": 0.283,
2703
+ "step": 354
2704
+ },
2705
+ {
2706
+ "epoch": 0.4456066945606695,
2707
+ "grad_norm": 3.162820339202881,
2708
+ "learning_rate": 1.356182292910844e-05,
2709
+ "loss": 0.2131,
2710
+ "step": 355
2711
+ },
2712
+ {
2713
+ "epoch": 0.44686192468619246,
2714
+ "grad_norm": 2.9676196575164795,
2715
+ "learning_rate": 1.3520789389637898e-05,
2716
+ "loss": 0.2782,
2717
+ "step": 356
2718
+ },
2719
+ {
2720
+ "epoch": 0.4481171548117155,
2721
+ "grad_norm": 5.9504008293151855,
2722
+ "learning_rate": 1.347968806837468e-05,
2723
+ "loss": 0.2663,
2724
+ "step": 357
2725
+ },
2726
+ {
2727
+ "epoch": 0.44937238493723847,
2728
+ "grad_norm": 5.749334335327148,
2729
+ "learning_rate": 1.3438519756596226e-05,
2730
+ "loss": 0.2307,
2731
+ "step": 358
2732
+ },
2733
+ {
2734
+ "epoch": 0.4506276150627615,
2735
+ "grad_norm": 5.305976390838623,
2736
+ "learning_rate": 1.339728524686968e-05,
2737
+ "loss": 0.2,
2738
+ "step": 359
2739
+ },
2740
+ {
2741
+ "epoch": 0.45188284518828453,
2742
+ "grad_norm": 5.051678657531738,
2743
+ "learning_rate": 1.335598533303662e-05,
2744
+ "loss": 0.3115,
2745
+ "step": 360
2746
+ },
2747
+ {
2748
+ "epoch": 0.45188284518828453,
2749
+ "eval_accuracy": 0.8543046357615894,
2750
+ "eval_f1": 0.6986301369863014,
2751
+ "eval_loss": 0.30281126499176025,
2752
+ "eval_precision": 0.8315217391304348,
2753
+ "eval_recall": 0.6023622047244095,
2754
+ "eval_runtime": 53.4812,
2755
+ "eval_samples_per_second": 5.179,
2756
+ "eval_steps_per_second": 0.168,
2757
+ "step": 360
2758
+ },
2759
+ {
2760
+ "epoch": 0.4531380753138075,
2761
+ "grad_norm": 3.419318914413452,
2762
+ "learning_rate": 1.331462081019776e-05,
2763
+ "loss": 0.2384,
2764
+ "step": 361
2765
+ },
2766
+ {
2767
+ "epoch": 0.45439330543933054,
2768
+ "grad_norm": 3.9998960494995117,
2769
+ "learning_rate": 1.327319247469768e-05,
2770
+ "loss": 0.2815,
2771
+ "step": 362
2772
+ },
2773
+ {
2774
+ "epoch": 0.4556485355648536,
2775
+ "grad_norm": 3.4446206092834473,
2776
+ "learning_rate": 1.323170112410946e-05,
2777
+ "loss": 0.272,
2778
+ "step": 363
2779
+ },
2780
+ {
2781
+ "epoch": 0.45690376569037655,
2782
+ "grad_norm": 3.6744120121002197,
2783
+ "learning_rate": 1.319014755721934e-05,
2784
+ "loss": 0.2609,
2785
+ "step": 364
2786
+ },
2787
+ {
2788
+ "epoch": 0.4581589958158996,
2789
+ "grad_norm": 4.846432209014893,
2790
+ "learning_rate": 1.3148532574011342e-05,
2791
+ "loss": 0.288,
2792
+ "step": 365
2793
+ },
2794
+ {
2795
+ "epoch": 0.4594142259414226,
2796
+ "grad_norm": 5.032169818878174,
2797
+ "learning_rate": 1.3106856975651866e-05,
2798
+ "loss": 0.2614,
2799
+ "step": 366
2800
+ },
2801
+ {
2802
+ "epoch": 0.4606694560669456,
2803
+ "grad_norm": 3.7232418060302734,
2804
+ "learning_rate": 1.3065121564474268e-05,
2805
+ "loss": 0.2498,
2806
+ "step": 367
2807
+ },
2808
+ {
2809
+ "epoch": 0.4619246861924686,
2810
+ "grad_norm": 3.13726544380188,
2811
+ "learning_rate": 1.3023327143963415e-05,
2812
+ "loss": 0.2192,
2813
+ "step": 368
2814
+ },
2815
+ {
2816
+ "epoch": 0.46317991631799166,
2817
+ "grad_norm": 5.035037994384766,
2818
+ "learning_rate": 1.2981474518740217e-05,
2819
+ "loss": 0.2971,
2820
+ "step": 369
2821
+ },
2822
+ {
2823
+ "epoch": 0.46443514644351463,
2824
+ "grad_norm": 3.5766642093658447,
2825
+ "learning_rate": 1.293956449454612e-05,
2826
+ "loss": 0.3288,
2827
+ "step": 370
2828
+ },
2829
+ {
2830
+ "epoch": 0.46569037656903767,
2831
+ "grad_norm": 2.6294803619384766,
2832
+ "learning_rate": 1.2897597878227624e-05,
2833
+ "loss": 0.236,
2834
+ "step": 371
2835
+ },
2836
+ {
2837
+ "epoch": 0.46694560669456064,
2838
+ "grad_norm": 5.947935104370117,
2839
+ "learning_rate": 1.285557547772072e-05,
2840
+ "loss": 0.2743,
2841
+ "step": 372
2842
+ },
2843
+ {
2844
+ "epoch": 0.4682008368200837,
2845
+ "grad_norm": 2.6133997440338135,
2846
+ "learning_rate": 1.2813498102035357e-05,
2847
+ "loss": 0.243,
2848
+ "step": 373
2849
+ },
2850
+ {
2851
+ "epoch": 0.4694560669456067,
2852
+ "grad_norm": 3.549476146697998,
2853
+ "learning_rate": 1.2771366561239865e-05,
2854
+ "loss": 0.1827,
2855
+ "step": 374
2856
+ },
2857
+ {
2858
+ "epoch": 0.4707112970711297,
2859
+ "grad_norm": 4.550835609436035,
2860
+ "learning_rate": 1.2729181666445338e-05,
2861
+ "loss": 0.2061,
2862
+ "step": 375
2863
+ },
2864
+ {
2865
+ "epoch": 0.4719665271966527,
2866
+ "grad_norm": 4.819687843322754,
2867
+ "learning_rate": 1.2686944229790044e-05,
2868
+ "loss": 0.2638,
2869
+ "step": 376
2870
+ },
2871
+ {
2872
+ "epoch": 0.47322175732217575,
2873
+ "grad_norm": 3.6842753887176514,
2874
+ "learning_rate": 1.264465506442378e-05,
2875
+ "loss": 0.2583,
2876
+ "step": 377
2877
+ },
2878
+ {
2879
+ "epoch": 0.47447698744769873,
2880
+ "grad_norm": 7.268190860748291,
2881
+ "learning_rate": 1.2602314984492222e-05,
2882
+ "loss": 0.2948,
2883
+ "step": 378
2884
+ },
2885
+ {
2886
+ "epoch": 0.47573221757322176,
2887
+ "grad_norm": 4.938054084777832,
2888
+ "learning_rate": 1.2559924805121236e-05,
2889
+ "loss": 0.2874,
2890
+ "step": 379
2891
+ },
2892
+ {
2893
+ "epoch": 0.4769874476987448,
2894
+ "grad_norm": 8.232144355773926,
2895
+ "learning_rate": 1.2517485342401201e-05,
2896
+ "loss": 0.2654,
2897
+ "step": 380
2898
+ },
2899
+ {
2900
+ "epoch": 0.4769874476987448,
2901
+ "eval_accuracy": 0.8543046357615894,
2902
+ "eval_f1": 0.7013574660633484,
2903
+ "eval_loss": 0.3007500171661377,
2904
+ "eval_precision": 0.824468085106383,
2905
+ "eval_recall": 0.610236220472441,
2906
+ "eval_runtime": 52.7757,
2907
+ "eval_samples_per_second": 5.249,
2908
+ "eval_steps_per_second": 0.171,
2909
+ "step": 380
2910
+ },
2911
+ {
2912
+ "epoch": 0.47824267782426777,
2913
+ "grad_norm": 4.203466892242432,
2914
+ "learning_rate": 1.2474997413371294e-05,
2915
+ "loss": 0.1688,
2916
+ "step": 381
2917
+ },
2918
+ {
2919
+ "epoch": 0.4794979079497908,
2920
+ "grad_norm": 2.6223390102386475,
2921
+ "learning_rate": 1.2432461836003762e-05,
2922
+ "loss": 0.2678,
2923
+ "step": 382
2924
+ },
2925
+ {
2926
+ "epoch": 0.48075313807531384,
2927
+ "grad_norm": 3.429694414138794,
2928
+ "learning_rate": 1.238987942918817e-05,
2929
+ "loss": 0.2859,
2930
+ "step": 383
2931
+ },
2932
+ {
2933
+ "epoch": 0.4820083682008368,
2934
+ "grad_norm": 2.9112257957458496,
2935
+ "learning_rate": 1.2347251012715629e-05,
2936
+ "loss": 0.2242,
2937
+ "step": 384
2938
+ },
2939
+ {
2940
+ "epoch": 0.48326359832635984,
2941
+ "grad_norm": 4.1868896484375,
2942
+ "learning_rate": 1.2304577407263032e-05,
2943
+ "loss": 0.2995,
2944
+ "step": 385
2945
+ },
2946
+ {
2947
+ "epoch": 0.4845188284518828,
2948
+ "grad_norm": 3.6559159755706787,
2949
+ "learning_rate": 1.2261859434377245e-05,
2950
+ "loss": 0.2115,
2951
+ "step": 386
2952
+ },
2953
+ {
2954
+ "epoch": 0.48577405857740585,
2955
+ "grad_norm": 4.471072673797607,
2956
+ "learning_rate": 1.2219097916459284e-05,
2957
+ "loss": 0.2012,
2958
+ "step": 387
2959
+ },
2960
+ {
2961
+ "epoch": 0.4870292887029289,
2962
+ "grad_norm": 4.849166393280029,
2963
+ "learning_rate": 1.2176293676748494e-05,
2964
+ "loss": 0.2927,
2965
+ "step": 388
2966
+ },
2967
+ {
2968
+ "epoch": 0.48828451882845186,
2969
+ "grad_norm": 4.391753196716309,
2970
+ "learning_rate": 1.2133447539306689e-05,
2971
+ "loss": 0.3133,
2972
+ "step": 389
2973
+ },
2974
+ {
2975
+ "epoch": 0.4895397489539749,
2976
+ "grad_norm": 2.870288848876953,
2977
+ "learning_rate": 1.2090560329002294e-05,
2978
+ "loss": 0.212,
2979
+ "step": 390
2980
+ },
2981
+ {
2982
+ "epoch": 0.49079497907949793,
2983
+ "grad_norm": 5.226803302764893,
2984
+ "learning_rate": 1.2047632871494472e-05,
2985
+ "loss": 0.224,
2986
+ "step": 391
2987
+ },
2988
+ {
2989
+ "epoch": 0.4920502092050209,
2990
+ "grad_norm": 3.988142728805542,
2991
+ "learning_rate": 1.200466599321721e-05,
2992
+ "loss": 0.2408,
2993
+ "step": 392
2994
+ },
2995
+ {
2996
+ "epoch": 0.49330543933054394,
2997
+ "grad_norm": 3.6183176040649414,
2998
+ "learning_rate": 1.196166052136342e-05,
2999
+ "loss": 0.2265,
3000
+ "step": 393
3001
+ },
3002
+ {
3003
+ "epoch": 0.49456066945606697,
3004
+ "grad_norm": 4.849849700927734,
3005
+ "learning_rate": 1.1918617283869021e-05,
3006
+ "loss": 0.2457,
3007
+ "step": 394
3008
+ },
3009
+ {
3010
+ "epoch": 0.49581589958158995,
3011
+ "grad_norm": 4.868863105773926,
3012
+ "learning_rate": 1.1875537109396978e-05,
3013
+ "loss": 0.2463,
3014
+ "step": 395
3015
+ },
3016
+ {
3017
+ "epoch": 0.497071129707113,
3018
+ "grad_norm": 3.9498956203460693,
3019
+ "learning_rate": 1.1832420827321374e-05,
3020
+ "loss": 0.2655,
3021
+ "step": 396
3022
+ },
3023
+ {
3024
+ "epoch": 0.49832635983263596,
3025
+ "grad_norm": 4.637706756591797,
3026
+ "learning_rate": 1.1789269267711425e-05,
3027
+ "loss": 0.2025,
3028
+ "step": 397
3029
+ },
3030
+ {
3031
+ "epoch": 0.499581589958159,
3032
+ "grad_norm": 3.8872170448303223,
3033
+ "learning_rate": 1.1746083261315505e-05,
3034
+ "loss": 0.2401,
3035
+ "step": 398
3036
+ },
3037
+ {
3038
+ "epoch": 0.500836820083682,
3039
+ "grad_norm": 3.0792970657348633,
3040
+ "learning_rate": 1.1702863639545157e-05,
3041
+ "loss": 0.2804,
3042
+ "step": 399
3043
+ },
3044
+ {
3045
+ "epoch": 0.502092050209205,
3046
+ "grad_norm": 5.217683792114258,
3047
+ "learning_rate": 1.165961123445908e-05,
3048
+ "loss": 0.2443,
3049
+ "step": 400
3050
+ },
3051
+ {
3052
+ "epoch": 0.502092050209205,
3053
+ "eval_accuracy": 0.8565121412803532,
3054
+ "eval_f1": 0.7161572052401747,
3055
+ "eval_loss": 0.2955167889595032,
3056
+ "eval_precision": 0.803921568627451,
3057
+ "eval_recall": 0.6456692913385826,
3058
+ "eval_runtime": 51.6679,
3059
+ "eval_samples_per_second": 5.361,
3060
+ "eval_steps_per_second": 0.174,
3061
+ "step": 400
3062
  }
3063
  ],
3064
  "logging_steps": 1,
 
3078
  "attributes": {}
3079
  }
3080
  },
3081
+ "total_flos": 1.4611649697467597e+17,
3082
  "train_batch_size": 6,
3083
  "trial_name": null,
3084
  "trial_params": null