ErrorAI commited on
Commit
069cf5b
·
verified ·
1 Parent(s): dd25d99

Training in progress, step 440, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:622986e1420dc3e7ee7885ea8da70861d0cf96bdea007127fbd180958f052a07
3
  size 36981072
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13cce9bb0e996351713314d3f8524bbdb49c9ae98bbc5d38e88243b1df518f08
3
  size 36981072
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:935c98ae461b3b8ed892a66114875574ebb0e183dd718956b230f0941978eab4
3
  size 19859524
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb676f03323a195f09bb3fb1a56057c8e39c831ecf332633428cccdf75a1efdb
3
  size 19859524
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d71f7343cca5753750cd88e4abed19d43187403cccb3b5aa9f782268b78a61c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e99cd4efdc8ae360618f7c31cdaf24ebec835e26008dd443222ea8c2ad794ce3
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91f6f24b84240d20fbf3f3b9ae432352426752db5e8618bd928fe6a5ad410144
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aa6d4cd0a1d119d88746df8d17b061da99249879d9cb64d05543ac4d112a2c5
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7504263786242183,
5
  "eval_steps": 500,
6
- "global_step": 330,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2317,6 +2317,784 @@
2317
  "learning_rate": 1.4965269896332885e-05,
2318
  "loss": 0.6016,
2319
  "step": 330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2320
  }
2321
  ],
2322
  "logging_steps": 1,
@@ -2331,12 +3109,12 @@
2331
  "should_evaluate": false,
2332
  "should_log": false,
2333
  "should_save": true,
2334
- "should_training_stop": false
2335
  },
2336
  "attributes": {}
2337
  }
2338
  },
2339
- "total_flos": 7.410215207043072e+16,
2340
  "train_batch_size": 4,
2341
  "trial_name": null,
2342
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0017055144968732,
5
  "eval_steps": 500,
6
+ "global_step": 440,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2317
  "learning_rate": 1.4965269896332885e-05,
2318
  "loss": 0.6016,
2319
  "step": 330
2320
+ },
2321
+ {
2322
+ "epoch": 0.7527003979533826,
2323
+ "grad_norm": 3.569581985473633,
2324
+ "learning_rate": 1.4708553057981355e-05,
2325
+ "loss": 0.4036,
2326
+ "step": 331
2327
+ },
2328
+ {
2329
+ "epoch": 0.7549744172825469,
2330
+ "grad_norm": 3.5513603687286377,
2331
+ "learning_rate": 1.4453676944196476e-05,
2332
+ "loss": 0.3642,
2333
+ "step": 332
2334
+ },
2335
+ {
2336
+ "epoch": 0.7572484366117112,
2337
+ "grad_norm": 3.9503798484802246,
2338
+ "learning_rate": 1.4200654848757994e-05,
2339
+ "loss": 0.3798,
2340
+ "step": 333
2341
+ },
2342
+ {
2343
+ "epoch": 0.7595224559408755,
2344
+ "grad_norm": 3.2657759189605713,
2345
+ "learning_rate": 1.3949499968744206e-05,
2346
+ "loss": 0.3445,
2347
+ "step": 334
2348
+ },
2349
+ {
2350
+ "epoch": 0.7617964752700398,
2351
+ "grad_norm": 1.5095345973968506,
2352
+ "learning_rate": 1.3700225403843469e-05,
2353
+ "loss": 0.1506,
2354
+ "step": 335
2355
+ },
2356
+ {
2357
+ "epoch": 0.7640704945992041,
2358
+ "grad_norm": 3.1756370067596436,
2359
+ "learning_rate": 1.3452844155671052e-05,
2360
+ "loss": 0.3452,
2361
+ "step": 336
2362
+ },
2363
+ {
2364
+ "epoch": 0.7663445139283684,
2365
+ "grad_norm": 2.36387038230896,
2366
+ "learning_rate": 1.3207369127090985e-05,
2367
+ "loss": 0.2018,
2368
+ "step": 337
2369
+ },
2370
+ {
2371
+ "epoch": 0.7686185332575327,
2372
+ "grad_norm": 1.7574656009674072,
2373
+ "learning_rate": 1.296381312154305e-05,
2374
+ "loss": 0.1288,
2375
+ "step": 338
2376
+ },
2377
+ {
2378
+ "epoch": 0.770892552586697,
2379
+ "grad_norm": 3.010063409805298,
2380
+ "learning_rate": 1.2722188842374966e-05,
2381
+ "loss": 0.2819,
2382
+ "step": 339
2383
+ },
2384
+ {
2385
+ "epoch": 0.7731665719158612,
2386
+ "grad_norm": 1.5432311296463013,
2387
+ "learning_rate": 1.2482508892179884e-05,
2388
+ "loss": 0.0887,
2389
+ "step": 340
2390
+ },
2391
+ {
2392
+ "epoch": 0.7754405912450256,
2393
+ "grad_norm": 2.749730110168457,
2394
+ "learning_rate": 1.2244785772138972e-05,
2395
+ "loss": 0.1669,
2396
+ "step": 341
2397
+ },
2398
+ {
2399
+ "epoch": 0.7777146105741899,
2400
+ "grad_norm": 4.618091583251953,
2401
+ "learning_rate": 1.2009031881369431e-05,
2402
+ "loss": 0.227,
2403
+ "step": 342
2404
+ },
2405
+ {
2406
+ "epoch": 0.7799886299033542,
2407
+ "grad_norm": 1.5743058919906616,
2408
+ "learning_rate": 1.177525951627781e-05,
2409
+ "loss": 0.1983,
2410
+ "step": 343
2411
+ },
2412
+ {
2413
+ "epoch": 0.7822626492325184,
2414
+ "grad_norm": 1.480060338973999,
2415
+ "learning_rate": 1.1543480869918555e-05,
2416
+ "loss": 0.0891,
2417
+ "step": 344
2418
+ },
2419
+ {
2420
+ "epoch": 0.7845366685616828,
2421
+ "grad_norm": 1.6388338804244995,
2422
+ "learning_rate": 1.1313708031358183e-05,
2423
+ "loss": 0.0913,
2424
+ "step": 345
2425
+ },
2426
+ {
2427
+ "epoch": 0.7868106878908471,
2428
+ "grad_norm": 0.9587397575378418,
2429
+ "learning_rate": 1.1085952985044634e-05,
2430
+ "loss": 0.041,
2431
+ "step": 346
2432
+ },
2433
+ {
2434
+ "epoch": 0.7890847072200113,
2435
+ "grad_norm": 1.376420497894287,
2436
+ "learning_rate": 1.0860227610182222e-05,
2437
+ "loss": 0.071,
2438
+ "step": 347
2439
+ },
2440
+ {
2441
+ "epoch": 0.7913587265491757,
2442
+ "grad_norm": 1.479590892791748,
2443
+ "learning_rate": 1.0636543680112044e-05,
2444
+ "loss": 0.0905,
2445
+ "step": 348
2446
+ },
2447
+ {
2448
+ "epoch": 0.79363274587834,
2449
+ "grad_norm": 1.486275553703308,
2450
+ "learning_rate": 1.04149128616979e-05,
2451
+ "loss": 0.1012,
2452
+ "step": 349
2453
+ },
2454
+ {
2455
+ "epoch": 0.7959067652075043,
2456
+ "grad_norm": 0.9402182698249817,
2457
+ "learning_rate": 1.0195346714717813e-05,
2458
+ "loss": 0.1013,
2459
+ "step": 350
2460
+ },
2461
+ {
2462
+ "epoch": 0.7981807845366685,
2463
+ "grad_norm": 4.111584186553955,
2464
+ "learning_rate": 9.977856691261057e-06,
2465
+ "loss": 0.6617,
2466
+ "step": 351
2467
+ },
2468
+ {
2469
+ "epoch": 0.8004548038658329,
2470
+ "grad_norm": 4.183617115020752,
2471
+ "learning_rate": 9.762454135130828e-06,
2472
+ "loss": 0.5664,
2473
+ "step": 352
2474
+ },
2475
+ {
2476
+ "epoch": 0.8027288231949972,
2477
+ "grad_norm": 3.7775447368621826,
2478
+ "learning_rate": 9.549150281252633e-06,
2479
+ "loss": 0.4659,
2480
+ "step": 353
2481
+ },
2482
+ {
2483
+ "epoch": 0.8050028425241614,
2484
+ "grad_norm": 3.4957637786865234,
2485
+ "learning_rate": 9.337956255088237e-06,
2486
+ "loss": 0.344,
2487
+ "step": 354
2488
+ },
2489
+ {
2490
+ "epoch": 0.8072768618533257,
2491
+ "grad_norm": 3.0935065746307373,
2492
+ "learning_rate": 9.12888307205541e-06,
2493
+ "loss": 0.3546,
2494
+ "step": 355
2495
+ },
2496
+ {
2497
+ "epoch": 0.8095508811824901,
2498
+ "grad_norm": 4.095314025878906,
2499
+ "learning_rate": 8.921941636953435e-06,
2500
+ "loss": 0.4984,
2501
+ "step": 356
2502
+ },
2503
+ {
2504
+ "epoch": 0.8118249005116543,
2505
+ "grad_norm": 3.417351007461548,
2506
+ "learning_rate": 8.717142743394236e-06,
2507
+ "loss": 0.3108,
2508
+ "step": 357
2509
+ },
2510
+ {
2511
+ "epoch": 0.8140989198408186,
2512
+ "grad_norm": 2.5139520168304443,
2513
+ "learning_rate": 8.514497073239491e-06,
2514
+ "loss": 0.3556,
2515
+ "step": 358
2516
+ },
2517
+ {
2518
+ "epoch": 0.816372939169983,
2519
+ "grad_norm": 4.973466873168945,
2520
+ "learning_rate": 8.3140151960435e-06,
2521
+ "loss": 0.5424,
2522
+ "step": 359
2523
+ },
2524
+ {
2525
+ "epoch": 0.8186469584991473,
2526
+ "grad_norm": 4.196943759918213,
2527
+ "learning_rate": 8.115707568501768e-06,
2528
+ "loss": 0.4552,
2529
+ "step": 360
2530
+ },
2531
+ {
2532
+ "epoch": 0.8209209778283115,
2533
+ "grad_norm": 4.142265796661377,
2534
+ "learning_rate": 7.919584533905777e-06,
2535
+ "loss": 0.3666,
2536
+ "step": 361
2537
+ },
2538
+ {
2539
+ "epoch": 0.8231949971574758,
2540
+ "grad_norm": 3.281536102294922,
2541
+ "learning_rate": 7.725656321603413e-06,
2542
+ "loss": 0.3572,
2543
+ "step": 362
2544
+ },
2545
+ {
2546
+ "epoch": 0.8254690164866402,
2547
+ "grad_norm": 5.057455539703369,
2548
+ "learning_rate": 7.533933046465419e-06,
2549
+ "loss": 0.5856,
2550
+ "step": 363
2551
+ },
2552
+ {
2553
+ "epoch": 0.8277430358158044,
2554
+ "grad_norm": 5.18012809753418,
2555
+ "learning_rate": 7.344424708357867e-06,
2556
+ "loss": 0.4198,
2557
+ "step": 364
2558
+ },
2559
+ {
2560
+ "epoch": 0.8300170551449687,
2561
+ "grad_norm": 2.427621841430664,
2562
+ "learning_rate": 7.157141191620548e-06,
2563
+ "loss": 0.1815,
2564
+ "step": 365
2565
+ },
2566
+ {
2567
+ "epoch": 0.832291074474133,
2568
+ "grad_norm": 0.5157700777053833,
2569
+ "learning_rate": 6.972092264551438e-06,
2570
+ "loss": 0.0464,
2571
+ "step": 366
2572
+ },
2573
+ {
2574
+ "epoch": 0.8345650938032974,
2575
+ "grad_norm": 0.9035636782646179,
2576
+ "learning_rate": 6.789287578897252e-06,
2577
+ "loss": 0.0749,
2578
+ "step": 367
2579
+ },
2580
+ {
2581
+ "epoch": 0.8368391131324616,
2582
+ "grad_norm": 0.434658020734787,
2583
+ "learning_rate": 6.6087366693499295e-06,
2584
+ "loss": 0.0233,
2585
+ "step": 368
2586
+ },
2587
+ {
2588
+ "epoch": 0.8391131324616259,
2589
+ "grad_norm": 0.6567199230194092,
2590
+ "learning_rate": 6.430448953049434e-06,
2591
+ "loss": 0.017,
2592
+ "step": 369
2593
+ },
2594
+ {
2595
+ "epoch": 0.8413871517907903,
2596
+ "grad_norm": 0.537125289440155,
2597
+ "learning_rate": 6.2544337290925185e-06,
2598
+ "loss": 0.0145,
2599
+ "step": 370
2600
+ },
2601
+ {
2602
+ "epoch": 0.8436611711199545,
2603
+ "grad_norm": 0.5989580750465393,
2604
+ "learning_rate": 6.080700178047688e-06,
2605
+ "loss": 0.0196,
2606
+ "step": 371
2607
+ },
2608
+ {
2609
+ "epoch": 0.8459351904491188,
2610
+ "grad_norm": 0.7875169515609741,
2611
+ "learning_rate": 5.909257361476405e-06,
2612
+ "loss": 0.0282,
2613
+ "step": 372
2614
+ },
2615
+ {
2616
+ "epoch": 0.8482092097782831,
2617
+ "grad_norm": 0.6391351819038391,
2618
+ "learning_rate": 5.740114221460424e-06,
2619
+ "loss": 0.0167,
2620
+ "step": 373
2621
+ },
2622
+ {
2623
+ "epoch": 0.8504832291074474,
2624
+ "grad_norm": 0.4413357377052307,
2625
+ "learning_rate": 5.573279580135438e-06,
2626
+ "loss": 0.0425,
2627
+ "step": 374
2628
+ },
2629
+ {
2630
+ "epoch": 0.8527572484366117,
2631
+ "grad_norm": 0.4121001958847046,
2632
+ "learning_rate": 5.408762139230888e-06,
2633
+ "loss": 0.0165,
2634
+ "step": 375
2635
+ },
2636
+ {
2637
+ "epoch": 0.855031267765776,
2638
+ "grad_norm": 0.6832776665687561,
2639
+ "learning_rate": 5.246570479616103e-06,
2640
+ "loss": 0.0272,
2641
+ "step": 376
2642
+ },
2643
+ {
2644
+ "epoch": 0.8573052870949404,
2645
+ "grad_norm": 3.090912342071533,
2646
+ "learning_rate": 5.086713060852788e-06,
2647
+ "loss": 0.176,
2648
+ "step": 377
2649
+ },
2650
+ {
2651
+ "epoch": 0.8595793064241046,
2652
+ "grad_norm": 2.9185822010040283,
2653
+ "learning_rate": 4.929198220753722e-06,
2654
+ "loss": 0.2692,
2655
+ "step": 378
2656
+ },
2657
+ {
2658
+ "epoch": 0.8618533257532689,
2659
+ "grad_norm": 5.094744682312012,
2660
+ "learning_rate": 4.774034174947922e-06,
2661
+ "loss": 0.4768,
2662
+ "step": 379
2663
+ },
2664
+ {
2665
+ "epoch": 0.8641273450824332,
2666
+ "grad_norm": 4.020772933959961,
2667
+ "learning_rate": 4.621229016452156e-06,
2668
+ "loss": 0.332,
2669
+ "step": 380
2670
+ },
2671
+ {
2672
+ "epoch": 0.8664013644115975,
2673
+ "grad_norm": 4.474671840667725,
2674
+ "learning_rate": 4.4707907152487405e-06,
2675
+ "loss": 0.3265,
2676
+ "step": 381
2677
+ },
2678
+ {
2679
+ "epoch": 0.8686753837407618,
2680
+ "grad_norm": 3.18890643119812,
2681
+ "learning_rate": 4.322727117869951e-06,
2682
+ "loss": 0.2424,
2683
+ "step": 382
2684
+ },
2685
+ {
2686
+ "epoch": 0.8709494030699261,
2687
+ "grad_norm": 4.199413299560547,
2688
+ "learning_rate": 4.1770459469887005e-06,
2689
+ "loss": 0.2787,
2690
+ "step": 383
2691
+ },
2692
+ {
2693
+ "epoch": 0.8732234223990903,
2694
+ "grad_norm": 2.4939513206481934,
2695
+ "learning_rate": 4.033754801015732e-06,
2696
+ "loss": 0.1588,
2697
+ "step": 384
2698
+ },
2699
+ {
2700
+ "epoch": 0.8754974417282547,
2701
+ "grad_norm": 3.3816030025482178,
2702
+ "learning_rate": 3.892861153703342e-06,
2703
+ "loss": 0.2011,
2704
+ "step": 385
2705
+ },
2706
+ {
2707
+ "epoch": 0.877771461057419,
2708
+ "grad_norm": 2.142763376235962,
2709
+ "learning_rate": 3.7543723537555585e-06,
2710
+ "loss": 0.1491,
2711
+ "step": 386
2712
+ },
2713
+ {
2714
+ "epoch": 0.8800454803865833,
2715
+ "grad_norm": 2.4426450729370117,
2716
+ "learning_rate": 3.6182956244448117e-06,
2717
+ "loss": 0.1654,
2718
+ "step": 387
2719
+ },
2720
+ {
2721
+ "epoch": 0.8823194997157476,
2722
+ "grad_norm": 1.4727312326431274,
2723
+ "learning_rate": 3.4846380632352458e-06,
2724
+ "loss": 0.0619,
2725
+ "step": 388
2726
+ },
2727
+ {
2728
+ "epoch": 0.8845935190449119,
2729
+ "grad_norm": 1.7243990898132324,
2730
+ "learning_rate": 3.35340664141246e-06,
2731
+ "loss": 0.0671,
2732
+ "step": 389
2733
+ },
2734
+ {
2735
+ "epoch": 0.8868675383740762,
2736
+ "grad_norm": 1.6698848009109497,
2737
+ "learning_rate": 3.2246082037199532e-06,
2738
+ "loss": 0.0581,
2739
+ "step": 390
2740
+ },
2741
+ {
2742
+ "epoch": 0.8891415577032404,
2743
+ "grad_norm": 2.760230302810669,
2744
+ "learning_rate": 3.0982494680021177e-06,
2745
+ "loss": 0.1919,
2746
+ "step": 391
2747
+ },
2748
+ {
2749
+ "epoch": 0.8914155770324048,
2750
+ "grad_norm": 4.160801410675049,
2751
+ "learning_rate": 2.9743370248538017e-06,
2752
+ "loss": 0.3573,
2753
+ "step": 392
2754
+ },
2755
+ {
2756
+ "epoch": 0.8936895963615691,
2757
+ "grad_norm": 1.2745391130447388,
2758
+ "learning_rate": 2.8528773372766216e-06,
2759
+ "loss": 0.0565,
2760
+ "step": 393
2761
+ },
2762
+ {
2763
+ "epoch": 0.8959636156907334,
2764
+ "grad_norm": 1.1233683824539185,
2765
+ "learning_rate": 2.7338767403418287e-06,
2766
+ "loss": 0.0371,
2767
+ "step": 394
2768
+ },
2769
+ {
2770
+ "epoch": 0.8982376350198976,
2771
+ "grad_norm": 1.4613378047943115,
2772
+ "learning_rate": 2.6173414408598827e-06,
2773
+ "loss": 0.0638,
2774
+ "step": 395
2775
+ },
2776
+ {
2777
+ "epoch": 0.900511654349062,
2778
+ "grad_norm": 1.0064411163330078,
2779
+ "learning_rate": 2.503277517056729e-06,
2780
+ "loss": 0.0226,
2781
+ "step": 396
2782
+ },
2783
+ {
2784
+ "epoch": 0.9027856736782263,
2785
+ "grad_norm": 1.1050655841827393,
2786
+ "learning_rate": 2.3916909182567782e-06,
2787
+ "loss": 0.0482,
2788
+ "step": 397
2789
+ },
2790
+ {
2791
+ "epoch": 0.9050596930073905,
2792
+ "grad_norm": 1.100659966468811,
2793
+ "learning_rate": 2.282587464572594e-06,
2794
+ "loss": 0.0531,
2795
+ "step": 398
2796
+ },
2797
+ {
2798
+ "epoch": 0.9073337123365549,
2799
+ "grad_norm": 0.8102996945381165,
2800
+ "learning_rate": 2.175972846601343e-06,
2801
+ "loss": 0.0399,
2802
+ "step": 399
2803
+ },
2804
+ {
2805
+ "epoch": 0.9096077316657192,
2806
+ "grad_norm": 2.706613779067993,
2807
+ "learning_rate": 2.0718526251279346e-06,
2808
+ "loss": 0.2256,
2809
+ "step": 400
2810
+ },
2811
+ {
2812
+ "epoch": 0.9118817509948834,
2813
+ "grad_norm": 4.959632396697998,
2814
+ "learning_rate": 1.9702322308350674e-06,
2815
+ "loss": 0.6795,
2816
+ "step": 401
2817
+ },
2818
+ {
2819
+ "epoch": 0.9141557703240477,
2820
+ "grad_norm": 4.623620510101318,
2821
+ "learning_rate": 1.8711169640198977e-06,
2822
+ "loss": 0.3954,
2823
+ "step": 402
2824
+ },
2825
+ {
2826
+ "epoch": 0.9164297896532121,
2827
+ "grad_norm": 3.527909517288208,
2828
+ "learning_rate": 1.774511994317629e-06,
2829
+ "loss": 0.4934,
2830
+ "step": 403
2831
+ },
2832
+ {
2833
+ "epoch": 0.9187038089823764,
2834
+ "grad_norm": 4.126560688018799,
2835
+ "learning_rate": 1.6804223604318825e-06,
2836
+ "loss": 0.2839,
2837
+ "step": 404
2838
+ },
2839
+ {
2840
+ "epoch": 0.9209778283115406,
2841
+ "grad_norm": 2.926298141479492,
2842
+ "learning_rate": 1.5888529698718346e-06,
2843
+ "loss": 0.2572,
2844
+ "step": 405
2845
+ },
2846
+ {
2847
+ "epoch": 0.9232518476407049,
2848
+ "grad_norm": 4.207761764526367,
2849
+ "learning_rate": 1.4998085986963283e-06,
2850
+ "loss": 0.4188,
2851
+ "step": 406
2852
+ },
2853
+ {
2854
+ "epoch": 0.9255258669698693,
2855
+ "grad_norm": 4.310345649719238,
2856
+ "learning_rate": 1.413293891264722e-06,
2857
+ "loss": 0.3598,
2858
+ "step": 407
2859
+ },
2860
+ {
2861
+ "epoch": 0.9277998862990335,
2862
+ "grad_norm": 2.979948043823242,
2863
+ "learning_rate": 1.3293133599946329e-06,
2864
+ "loss": 0.2512,
2865
+ "step": 408
2866
+ },
2867
+ {
2868
+ "epoch": 0.9300739056281978,
2869
+ "grad_norm": 2.9833507537841797,
2870
+ "learning_rate": 1.2478713851266088e-06,
2871
+ "loss": 0.305,
2872
+ "step": 409
2873
+ },
2874
+ {
2875
+ "epoch": 0.9323479249573622,
2876
+ "grad_norm": 3.2704060077667236,
2877
+ "learning_rate": 1.1689722144956671e-06,
2878
+ "loss": 0.431,
2879
+ "step": 410
2880
+ },
2881
+ {
2882
+ "epoch": 0.9346219442865265,
2883
+ "grad_norm": 3.7926714420318604,
2884
+ "learning_rate": 1.0926199633097157e-06,
2885
+ "loss": 0.3448,
2886
+ "step": 411
2887
+ },
2888
+ {
2889
+ "epoch": 0.9368959636156907,
2890
+ "grad_norm": 3.6115946769714355,
2891
+ "learning_rate": 1.0188186139349354e-06,
2892
+ "loss": 0.4043,
2893
+ "step": 412
2894
+ },
2895
+ {
2896
+ "epoch": 0.939169982944855,
2897
+ "grad_norm": 1.762302041053772,
2898
+ "learning_rate": 9.475720156880419e-07,
2899
+ "loss": 0.099,
2900
+ "step": 413
2901
+ },
2902
+ {
2903
+ "epoch": 0.9414440022740194,
2904
+ "grad_norm": 0.5029296278953552,
2905
+ "learning_rate": 8.788838846355341e-07,
2906
+ "loss": 0.0218,
2907
+ "step": 414
2908
+ },
2909
+ {
2910
+ "epoch": 0.9437180216031836,
2911
+ "grad_norm": 0.4769535958766937,
2912
+ "learning_rate": 8.127578033998662e-07,
2913
+ "loss": 0.0152,
2914
+ "step": 415
2915
+ },
2916
+ {
2917
+ "epoch": 0.9459920409323479,
2918
+ "grad_norm": 0.8436768651008606,
2919
+ "learning_rate": 7.491972209725806e-07,
2920
+ "loss": 0.0319,
2921
+ "step": 416
2922
+ },
2923
+ {
2924
+ "epoch": 0.9482660602615123,
2925
+ "grad_norm": 0.6843982934951782,
2926
+ "learning_rate": 6.88205452534435e-07,
2927
+ "loss": 0.038,
2928
+ "step": 417
2929
+ },
2930
+ {
2931
+ "epoch": 0.9505400795906765,
2932
+ "grad_norm": 0.6065624356269836,
2933
+ "learning_rate": 6.297856792824741e-07,
2934
+ "loss": 0.0266,
2935
+ "step": 418
2936
+ },
2937
+ {
2938
+ "epoch": 0.9528140989198408,
2939
+ "grad_norm": 1.784542202949524,
2940
+ "learning_rate": 5.739409482640956e-07,
2941
+ "loss": 0.041,
2942
+ "step": 419
2943
+ },
2944
+ {
2945
+ "epoch": 0.9550881182490051,
2946
+ "grad_norm": 0.4885413646697998,
2947
+ "learning_rate": 5.206741722181386e-07,
2948
+ "loss": 0.0198,
2949
+ "step": 420
2950
+ },
2951
+ {
2952
+ "epoch": 0.9573621375781695,
2953
+ "grad_norm": 0.47082552313804626,
2954
+ "learning_rate": 4.699881294229602e-07,
2955
+ "loss": 0.0176,
2956
+ "step": 421
2957
+ },
2958
+ {
2959
+ "epoch": 0.9596361569073337,
2960
+ "grad_norm": 0.7649410963058472,
2961
+ "learning_rate": 4.2188546355153013e-07,
2962
+ "loss": 0.0433,
2963
+ "step": 422
2964
+ },
2965
+ {
2966
+ "epoch": 0.961910176236498,
2967
+ "grad_norm": 2.893561840057373,
2968
+ "learning_rate": 3.763686835335345e-07,
2969
+ "loss": 0.2227,
2970
+ "step": 423
2971
+ },
2972
+ {
2973
+ "epoch": 0.9641841955656623,
2974
+ "grad_norm": 2.788997173309326,
2975
+ "learning_rate": 3.334401634245032e-07,
2976
+ "loss": 0.226,
2977
+ "step": 424
2978
+ },
2979
+ {
2980
+ "epoch": 0.9664582148948266,
2981
+ "grad_norm": 5.2662739753723145,
2982
+ "learning_rate": 2.9310214228202013e-07,
2983
+ "loss": 0.5212,
2984
+ "step": 425
2985
+ },
2986
+ {
2987
+ "epoch": 0.9687322342239909,
2988
+ "grad_norm": 2.4451019763946533,
2989
+ "learning_rate": 2.553567240489052e-07,
2990
+ "loss": 0.235,
2991
+ "step": 426
2992
+ },
2993
+ {
2994
+ "epoch": 0.9710062535531552,
2995
+ "grad_norm": 3.208662748336792,
2996
+ "learning_rate": 2.202058774434912e-07,
2997
+ "loss": 0.2381,
2998
+ "step": 427
2999
+ },
3000
+ {
3001
+ "epoch": 0.9732802728823196,
3002
+ "grad_norm": 3.0693438053131104,
3003
+ "learning_rate": 1.8765143585693922e-07,
3004
+ "loss": 0.2596,
3005
+ "step": 428
3006
+ },
3007
+ {
3008
+ "epoch": 0.9755542922114838,
3009
+ "grad_norm": 4.01469612121582,
3010
+ "learning_rate": 1.5769509725760966e-07,
3011
+ "loss": 0.2541,
3012
+ "step": 429
3013
+ },
3014
+ {
3015
+ "epoch": 0.9778283115406481,
3016
+ "grad_norm": 2.0251407623291016,
3017
+ "learning_rate": 1.3033842410251075e-07,
3018
+ "loss": 0.1398,
3019
+ "step": 430
3020
+ },
3021
+ {
3022
+ "epoch": 0.9801023308698124,
3023
+ "grad_norm": 1.4239128828048706,
3024
+ "learning_rate": 1.0558284325578038e-07,
3025
+ "loss": 0.0793,
3026
+ "step": 431
3027
+ },
3028
+ {
3029
+ "epoch": 0.9823763501989767,
3030
+ "grad_norm": 1.6692121028900146,
3031
+ "learning_rate": 8.342964591430136e-08,
3032
+ "loss": 0.1298,
3033
+ "step": 432
3034
+ },
3035
+ {
3036
+ "epoch": 0.984650369528141,
3037
+ "grad_norm": 1.8359993696212769,
3038
+ "learning_rate": 6.38799875403051e-08,
3039
+ "loss": 0.0778,
3040
+ "step": 433
3041
+ },
3042
+ {
3043
+ "epoch": 0.9869243888573053,
3044
+ "grad_norm": 1.6211203336715698,
3045
+ "learning_rate": 4.6934887801164396e-08,
3046
+ "loss": 0.148,
3047
+ "step": 434
3048
+ },
3049
+ {
3050
+ "epoch": 0.9891984081864695,
3051
+ "grad_norm": 0.7776852250099182,
3052
+ "learning_rate": 3.259523051615254e-08,
3053
+ "loss": 0.0437,
3054
+ "step": 435
3055
+ },
3056
+ {
3057
+ "epoch": 0.9914724275156339,
3058
+ "grad_norm": 0.6490265727043152,
3059
+ "learning_rate": 2.086176361038583e-08,
3060
+ "loss": 0.0175,
3061
+ "step": 436
3062
+ },
3063
+ {
3064
+ "epoch": 0.9937464468447982,
3065
+ "grad_norm": 1.8342341184616089,
3066
+ "learning_rate": 1.173509907579362e-08,
3067
+ "loss": 0.0702,
3068
+ "step": 437
3069
+ },
3070
+ {
3071
+ "epoch": 0.9960204661739624,
3072
+ "grad_norm": 1.4983391761779785,
3073
+ "learning_rate": 5.215712939210526e-09,
3074
+ "loss": 0.0609,
3075
+ "step": 438
3076
+ },
3077
+ {
3078
+ "epoch": 0.9982944855031268,
3079
+ "grad_norm": 0.8002070784568787,
3080
+ "learning_rate": 1.3039452375351868e-09,
3081
+ "loss": 0.0323,
3082
+ "step": 439
3083
+ },
3084
+ {
3085
+ "epoch": 0.9982944855031268,
3086
+ "eval_loss": 0.180589497089386,
3087
+ "eval_runtime": 13.3573,
3088
+ "eval_samples_per_second": 27.775,
3089
+ "eval_steps_per_second": 6.962,
3090
+ "step": 439
3091
+ },
3092
+ {
3093
+ "epoch": 1.0017055144968732,
3094
+ "grad_norm": 3.280466079711914,
3095
+ "learning_rate": 0.0,
3096
+ "loss": 0.4975,
3097
+ "step": 440
3098
  }
3099
  ],
3100
  "logging_steps": 1,
 
3109
  "should_evaluate": false,
3110
  "should_log": false,
3111
  "should_save": true,
3112
+ "should_training_stop": true
3113
  },
3114
  "attributes": {}
3115
  }
3116
  },
3117
+ "total_flos": 9.712732842491904e+16,
3118
  "train_batch_size": 4,
3119
  "trial_name": null,
3120
  "trial_params": null