ErrorAI commited on
Commit
999e810
·
verified ·
1 Parent(s): 7851115

Training in progress, step 459, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:278544bd33ac7e11892f69ff2a5b02528f47d3f92ae18af4060cbadb270e856a
3
  size 9823216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a621e3f29008fb453ed2cf978e69a9ebdc967a23022e7b68588ddd7464e9f5f
3
  size 9823216
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:918743597de1f20d3c3fce1be5acfcdc1a4c2f295bf3ecd91451c47fd23acb0c
3
  size 5963308
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:272c9698c5a63826c16a28fcfeb1bf2cca0e6c50d6284f067339e1f8383f0536
3
  size 5963308
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96327244f9b8a6fad89ea5ea1c33e519967f49cf892e8708e1eaff5e2454c806
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21b5efc09a44b04d8556fedaec1c570849e7b36226a500553357208dd76bb9f2
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f8c76e1f73821e5dc5f19af66b43cce93c7368418083f7e44c06cc74ae4f057
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47355b5abe01a117c8c3acc65c7d1228e222f8fe09358b0c7a3a25456e8bba9b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7516339869281046,
5
  "eval_steps": 500,
6
- "global_step": 345,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2422,6 +2422,804 @@
2422
  "learning_rate": 1.4767198386708997e-05,
2423
  "loss": 1.5286,
2424
  "step": 345
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2425
  }
2426
  ],
2427
  "logging_steps": 1,
@@ -2436,12 +3234,12 @@
2436
  "should_evaluate": false,
2437
  "should_log": false,
2438
  "should_save": true,
2439
- "should_training_stop": false
2440
  },
2441
  "attributes": {}
2442
  }
2443
  },
2444
- "total_flos": 3708736431980544.0,
2445
  "train_batch_size": 4,
2446
  "trial_name": null,
2447
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 459,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2422
  "learning_rate": 1.4767198386708997e-05,
2423
  "loss": 1.5286,
2424
  "step": 345
2425
+ },
2426
+ {
2427
+ "epoch": 0.7538126361655774,
2428
+ "grad_norm": 0.4534967541694641,
2429
+ "learning_rate": 1.4522546730656129e-05,
2430
+ "loss": 1.4874,
2431
+ "step": 346
2432
+ },
2433
+ {
2434
+ "epoch": 0.7559912854030502,
2435
+ "grad_norm": 0.4711008369922638,
2436
+ "learning_rate": 1.427959386087761e-05,
2437
+ "loss": 1.5148,
2438
+ "step": 347
2439
+ },
2440
+ {
2441
+ "epoch": 0.7581699346405228,
2442
+ "grad_norm": 0.433192640542984,
2443
+ "learning_rate": 1.4038351410818434e-05,
2444
+ "loss": 1.6257,
2445
+ "step": 348
2446
+ },
2447
+ {
2448
+ "epoch": 0.7603485838779956,
2449
+ "grad_norm": 0.4659889042377472,
2450
+ "learning_rate": 1.3798830932022617e-05,
2451
+ "loss": 1.6029,
2452
+ "step": 349
2453
+ },
2454
+ {
2455
+ "epoch": 0.7625272331154684,
2456
+ "grad_norm": 0.5066768527030945,
2457
+ "learning_rate": 1.3561043893580083e-05,
2458
+ "loss": 1.6275,
2459
+ "step": 350
2460
+ },
2461
+ {
2462
+ "epoch": 0.7647058823529411,
2463
+ "grad_norm": 0.4900889992713928,
2464
+ "learning_rate": 1.3325001681577482e-05,
2465
+ "loss": 1.8466,
2466
+ "step": 351
2467
+ },
2468
+ {
2469
+ "epoch": 0.7668845315904139,
2470
+ "grad_norm": 0.39974841475486755,
2471
+ "learning_rate": 1.3090715598552999e-05,
2472
+ "loss": 1.6942,
2473
+ "step": 352
2474
+ },
2475
+ {
2476
+ "epoch": 0.7690631808278867,
2477
+ "grad_norm": 0.41923969984054565,
2478
+ "learning_rate": 1.2858196862955107e-05,
2479
+ "loss": 1.4645,
2480
+ "step": 353
2481
+ },
2482
+ {
2483
+ "epoch": 0.7712418300653595,
2484
+ "grad_norm": 0.39095959067344666,
2485
+ "learning_rate": 1.2627456608605443e-05,
2486
+ "loss": 1.6083,
2487
+ "step": 354
2488
+ },
2489
+ {
2490
+ "epoch": 0.7734204793028322,
2491
+ "grad_norm": 0.3984636068344116,
2492
+ "learning_rate": 1.239850588416565e-05,
2493
+ "loss": 1.6569,
2494
+ "step": 355
2495
+ },
2496
+ {
2497
+ "epoch": 0.775599128540305,
2498
+ "grad_norm": 0.4196769893169403,
2499
+ "learning_rate": 1.217135565260833e-05,
2500
+ "loss": 1.6212,
2501
+ "step": 356
2502
+ },
2503
+ {
2504
+ "epoch": 0.7777777777777778,
2505
+ "grad_norm": 0.4557774066925049,
2506
+ "learning_rate": 1.1946016790692094e-05,
2507
+ "loss": 1.6636,
2508
+ "step": 357
2509
+ },
2510
+ {
2511
+ "epoch": 0.7799564270152506,
2512
+ "grad_norm": 0.3991740942001343,
2513
+ "learning_rate": 1.1722500088440768e-05,
2514
+ "loss": 1.4824,
2515
+ "step": 358
2516
+ },
2517
+ {
2518
+ "epoch": 0.7821350762527233,
2519
+ "grad_norm": 0.44218507409095764,
2520
+ "learning_rate": 1.1500816248626712e-05,
2521
+ "loss": 1.5913,
2522
+ "step": 359
2523
+ },
2524
+ {
2525
+ "epoch": 0.7843137254901961,
2526
+ "grad_norm": 0.3721107542514801,
2527
+ "learning_rate": 1.1280975886258293e-05,
2528
+ "loss": 1.5063,
2529
+ "step": 360
2530
+ },
2531
+ {
2532
+ "epoch": 0.7864923747276689,
2533
+ "grad_norm": 0.5461967587471008,
2534
+ "learning_rate": 1.1062989528071682e-05,
2535
+ "loss": 1.7895,
2536
+ "step": 361
2537
+ },
2538
+ {
2539
+ "epoch": 0.7886710239651417,
2540
+ "grad_norm": 0.38676172494888306,
2541
+ "learning_rate": 1.0846867612026745e-05,
2542
+ "loss": 1.4693,
2543
+ "step": 362
2544
+ },
2545
+ {
2546
+ "epoch": 0.7908496732026143,
2547
+ "grad_norm": 0.45002123713493347,
2548
+ "learning_rate": 1.0632620486807243e-05,
2549
+ "loss": 1.5761,
2550
+ "step": 363
2551
+ },
2552
+ {
2553
+ "epoch": 0.7930283224400871,
2554
+ "grad_norm": 0.384592741727829,
2555
+ "learning_rate": 1.0420258411325307e-05,
2556
+ "loss": 1.4709,
2557
+ "step": 364
2558
+ },
2559
+ {
2560
+ "epoch": 0.7952069716775599,
2561
+ "grad_norm": 0.39452260732650757,
2562
+ "learning_rate": 1.0209791554230209e-05,
2563
+ "loss": 1.6416,
2564
+ "step": 365
2565
+ },
2566
+ {
2567
+ "epoch": 0.7973856209150327,
2568
+ "grad_norm": 0.4410095512866974,
2569
+ "learning_rate": 1.0001229993421412e-05,
2570
+ "loss": 1.6746,
2571
+ "step": 366
2572
+ },
2573
+ {
2574
+ "epoch": 0.7995642701525054,
2575
+ "grad_norm": 0.4333903193473816,
2576
+ "learning_rate": 9.79458371556607e-06,
2577
+ "loss": 1.5514,
2578
+ "step": 367
2579
+ },
2580
+ {
2581
+ "epoch": 0.8017429193899782,
2582
+ "grad_norm": 0.38147422671318054,
2583
+ "learning_rate": 9.589862615620782e-06,
2584
+ "loss": 1.5221,
2585
+ "step": 368
2586
+ },
2587
+ {
2588
+ "epoch": 0.803921568627451,
2589
+ "grad_norm": 0.4305866062641144,
2590
+ "learning_rate": 9.387076496357805e-06,
2591
+ "loss": 1.5458,
2592
+ "step": 369
2593
+ },
2594
+ {
2595
+ "epoch": 0.8061002178649237,
2596
+ "grad_norm": 0.42009007930755615,
2597
+ "learning_rate": 9.186235067895672e-06,
2598
+ "loss": 1.4395,
2599
+ "step": 370
2600
+ },
2601
+ {
2602
+ "epoch": 0.8082788671023965,
2603
+ "grad_norm": 0.42722266912460327,
2604
+ "learning_rate": 8.987347947234192e-06,
2605
+ "loss": 1.5633,
2606
+ "step": 371
2607
+ },
2608
+ {
2609
+ "epoch": 0.8104575163398693,
2610
+ "grad_norm": 0.44506317377090454,
2611
+ "learning_rate": 8.790424657794034e-06,
2612
+ "loss": 1.5972,
2613
+ "step": 372
2614
+ },
2615
+ {
2616
+ "epoch": 0.8126361655773421,
2617
+ "grad_norm": 0.44512099027633667,
2618
+ "learning_rate": 8.595474628960598e-06,
2619
+ "loss": 1.5575,
2620
+ "step": 373
2621
+ },
2622
+ {
2623
+ "epoch": 0.8148148148148148,
2624
+ "grad_norm": 0.41824987530708313,
2625
+ "learning_rate": 8.402507195632626e-06,
2626
+ "loss": 1.6123,
2627
+ "step": 374
2628
+ },
2629
+ {
2630
+ "epoch": 0.8169934640522876,
2631
+ "grad_norm": 0.4357855021953583,
2632
+ "learning_rate": 8.211531597775134e-06,
2633
+ "loss": 1.6286,
2634
+ "step": 375
2635
+ },
2636
+ {
2637
+ "epoch": 0.8191721132897604,
2638
+ "grad_norm": 0.42029663920402527,
2639
+ "learning_rate": 8.022556979976992e-06,
2640
+ "loss": 1.4422,
2641
+ "step": 376
2642
+ },
2643
+ {
2644
+ "epoch": 0.8213507625272332,
2645
+ "grad_norm": 0.4830271005630493,
2646
+ "learning_rate": 7.835592391013053e-06,
2647
+ "loss": 1.7282,
2648
+ "step": 377
2649
+ },
2650
+ {
2651
+ "epoch": 0.8235294117647058,
2652
+ "grad_norm": 0.45707976818084717,
2653
+ "learning_rate": 7.650646783410875e-06,
2654
+ "loss": 1.456,
2655
+ "step": 378
2656
+ },
2657
+ {
2658
+ "epoch": 0.8257080610021786,
2659
+ "grad_norm": 0.42442944645881653,
2660
+ "learning_rate": 7.4677290130219794e-06,
2661
+ "loss": 1.6168,
2662
+ "step": 379
2663
+ },
2664
+ {
2665
+ "epoch": 0.8278867102396514,
2666
+ "grad_norm": 0.448371946811676,
2667
+ "learning_rate": 7.286847838597905e-06,
2668
+ "loss": 1.6022,
2669
+ "step": 380
2670
+ },
2671
+ {
2672
+ "epoch": 0.8300653594771242,
2673
+ "grad_norm": 0.40033116936683655,
2674
+ "learning_rate": 7.108011921370728e-06,
2675
+ "loss": 1.5397,
2676
+ "step": 381
2677
+ },
2678
+ {
2679
+ "epoch": 0.8322440087145969,
2680
+ "grad_norm": 0.45119813084602356,
2681
+ "learning_rate": 6.9312298246383575e-06,
2682
+ "loss": 1.6904,
2683
+ "step": 382
2684
+ },
2685
+ {
2686
+ "epoch": 0.8344226579520697,
2687
+ "grad_norm": 0.43013447523117065,
2688
+ "learning_rate": 6.7565100133545115e-06,
2689
+ "loss": 1.5514,
2690
+ "step": 383
2691
+ },
2692
+ {
2693
+ "epoch": 0.8366013071895425,
2694
+ "grad_norm": 0.42408111691474915,
2695
+ "learning_rate": 6.583860853723339e-06,
2696
+ "loss": 1.5711,
2697
+ "step": 384
2698
+ },
2699
+ {
2700
+ "epoch": 0.8387799564270153,
2701
+ "grad_norm": 0.45047736167907715,
2702
+ "learning_rate": 6.413290612798884e-06,
2703
+ "loss": 1.5574,
2704
+ "step": 385
2705
+ },
2706
+ {
2707
+ "epoch": 0.840958605664488,
2708
+ "grad_norm": 0.4935206472873688,
2709
+ "learning_rate": 6.24480745808913e-06,
2710
+ "loss": 1.6341,
2711
+ "step": 386
2712
+ },
2713
+ {
2714
+ "epoch": 0.8431372549019608,
2715
+ "grad_norm": 0.4137425124645233,
2716
+ "learning_rate": 6.078419457165036e-06,
2717
+ "loss": 1.5075,
2718
+ "step": 387
2719
+ },
2720
+ {
2721
+ "epoch": 0.8453159041394336,
2722
+ "grad_norm": 0.4301162362098694,
2723
+ "learning_rate": 5.914134577274122e-06,
2724
+ "loss": 1.6337,
2725
+ "step": 388
2726
+ },
2727
+ {
2728
+ "epoch": 0.8474945533769063,
2729
+ "grad_norm": 0.4462296962738037,
2730
+ "learning_rate": 5.751960684959046e-06,
2731
+ "loss": 1.5962,
2732
+ "step": 389
2733
+ },
2734
+ {
2735
+ "epoch": 0.8496732026143791,
2736
+ "grad_norm": 0.44939714670181274,
2737
+ "learning_rate": 5.59190554568087e-06,
2738
+ "loss": 1.5191,
2739
+ "step": 390
2740
+ },
2741
+ {
2742
+ "epoch": 0.8518518518518519,
2743
+ "grad_norm": 0.4732544720172882,
2744
+ "learning_rate": 5.4339768234472625e-06,
2745
+ "loss": 1.5137,
2746
+ "step": 391
2747
+ },
2748
+ {
2749
+ "epoch": 0.8540305010893247,
2750
+ "grad_norm": 0.4495387077331543,
2751
+ "learning_rate": 5.27818208044551e-06,
2752
+ "loss": 1.5465,
2753
+ "step": 392
2754
+ },
2755
+ {
2756
+ "epoch": 0.8562091503267973,
2757
+ "grad_norm": 0.5142303109169006,
2758
+ "learning_rate": 5.124528776680371e-06,
2759
+ "loss": 1.7364,
2760
+ "step": 393
2761
+ },
2762
+ {
2763
+ "epoch": 0.8583877995642701,
2764
+ "grad_norm": 0.4349537789821625,
2765
+ "learning_rate": 4.9730242696169325e-06,
2766
+ "loss": 1.4617,
2767
+ "step": 394
2768
+ },
2769
+ {
2770
+ "epoch": 0.8605664488017429,
2771
+ "grad_norm": 0.46584171056747437,
2772
+ "learning_rate": 4.8236758138282715e-06,
2773
+ "loss": 1.5325,
2774
+ "step": 395
2775
+ },
2776
+ {
2777
+ "epoch": 0.8627450980392157,
2778
+ "grad_norm": 0.5047036409378052,
2779
+ "learning_rate": 4.676490560648067e-06,
2780
+ "loss": 1.5519,
2781
+ "step": 396
2782
+ },
2783
+ {
2784
+ "epoch": 0.8649237472766884,
2785
+ "grad_norm": 0.47344809770584106,
2786
+ "learning_rate": 4.531475557828202e-06,
2787
+ "loss": 1.5025,
2788
+ "step": 397
2789
+ },
2790
+ {
2791
+ "epoch": 0.8671023965141612,
2792
+ "grad_norm": 0.47757408022880554,
2793
+ "learning_rate": 4.388637749201274e-06,
2794
+ "loss": 1.5715,
2795
+ "step": 398
2796
+ },
2797
+ {
2798
+ "epoch": 0.869281045751634,
2799
+ "grad_norm": 0.5043246746063232,
2800
+ "learning_rate": 4.247983974348096e-06,
2801
+ "loss": 1.6822,
2802
+ "step": 399
2803
+ },
2804
+ {
2805
+ "epoch": 0.8714596949891068,
2806
+ "grad_norm": 0.6163721084594727,
2807
+ "learning_rate": 4.109520968270198e-06,
2808
+ "loss": 1.528,
2809
+ "step": 400
2810
+ },
2811
+ {
2812
+ "epoch": 0.8736383442265795,
2813
+ "grad_norm": 0.41478580236434937,
2814
+ "learning_rate": 3.973255361067346e-06,
2815
+ "loss": 1.6695,
2816
+ "step": 401
2817
+ },
2818
+ {
2819
+ "epoch": 0.8758169934640523,
2820
+ "grad_norm": 0.4069858491420746,
2821
+ "learning_rate": 3.839193677620028e-06,
2822
+ "loss": 1.572,
2823
+ "step": 402
2824
+ },
2825
+ {
2826
+ "epoch": 0.8779956427015251,
2827
+ "grad_norm": 0.44524499773979187,
2828
+ "learning_rate": 3.7073423372770753e-06,
2829
+ "loss": 1.6545,
2830
+ "step": 403
2831
+ },
2832
+ {
2833
+ "epoch": 0.8801742919389978,
2834
+ "grad_norm": 0.40681320428848267,
2835
+ "learning_rate": 3.577707653548229e-06,
2836
+ "loss": 1.6042,
2837
+ "step": 404
2838
+ },
2839
+ {
2840
+ "epoch": 0.8823529411764706,
2841
+ "grad_norm": 0.39351528882980347,
2842
+ "learning_rate": 3.4502958338018753e-06,
2843
+ "loss": 1.5133,
2844
+ "step": 405
2845
+ },
2846
+ {
2847
+ "epoch": 0.8845315904139434,
2848
+ "grad_norm": 0.4517339766025543,
2849
+ "learning_rate": 3.325112978967776e-06,
2850
+ "loss": 1.6433,
2851
+ "step": 406
2852
+ },
2853
+ {
2854
+ "epoch": 0.8867102396514162,
2855
+ "grad_norm": 0.406124085187912,
2856
+ "learning_rate": 3.20216508324494e-06,
2857
+ "loss": 1.5657,
2858
+ "step": 407
2859
+ },
2860
+ {
2861
+ "epoch": 0.8888888888888888,
2862
+ "grad_norm": 0.40703821182250977,
2863
+ "learning_rate": 3.081458033814627e-06,
2864
+ "loss": 1.6277,
2865
+ "step": 408
2866
+ },
2867
+ {
2868
+ "epoch": 0.8910675381263616,
2869
+ "grad_norm": 0.44792646169662476,
2870
+ "learning_rate": 2.9629976105584266e-06,
2871
+ "loss": 1.703,
2872
+ "step": 409
2873
+ },
2874
+ {
2875
+ "epoch": 0.8932461873638344,
2876
+ "grad_norm": 0.42939573526382446,
2877
+ "learning_rate": 2.8467894857814814e-06,
2878
+ "loss": 1.6469,
2879
+ "step": 410
2880
+ },
2881
+ {
2882
+ "epoch": 0.8954248366013072,
2883
+ "grad_norm": 0.41463714838027954,
2884
+ "learning_rate": 2.732839223940914e-06,
2885
+ "loss": 1.4841,
2886
+ "step": 411
2887
+ },
2888
+ {
2889
+ "epoch": 0.8976034858387799,
2890
+ "grad_norm": 0.44514474272727966,
2891
+ "learning_rate": 2.621152281379352e-06,
2892
+ "loss": 1.7912,
2893
+ "step": 412
2894
+ },
2895
+ {
2896
+ "epoch": 0.8997821350762527,
2897
+ "grad_norm": 0.4165302515029907,
2898
+ "learning_rate": 2.511734006063682e-06,
2899
+ "loss": 1.5175,
2900
+ "step": 413
2901
+ },
2902
+ {
2903
+ "epoch": 0.9019607843137255,
2904
+ "grad_norm": 0.40819787979125977,
2905
+ "learning_rate": 2.4045896373289467e-06,
2906
+ "loss": 1.5655,
2907
+ "step": 414
2908
+ },
2909
+ {
2910
+ "epoch": 0.9041394335511983,
2911
+ "grad_norm": 0.4505191445350647,
2912
+ "learning_rate": 2.299724305627482e-06,
2913
+ "loss": 1.5763,
2914
+ "step": 415
2915
+ },
2916
+ {
2917
+ "epoch": 0.906318082788671,
2918
+ "grad_norm": 0.42655885219573975,
2919
+ "learning_rate": 2.1971430322832554e-06,
2920
+ "loss": 1.536,
2921
+ "step": 416
2922
+ },
2923
+ {
2924
+ "epoch": 0.9084967320261438,
2925
+ "grad_norm": 0.4479508101940155,
2926
+ "learning_rate": 2.0968507292514037e-06,
2927
+ "loss": 1.574,
2928
+ "step": 417
2929
+ },
2930
+ {
2931
+ "epoch": 0.9106753812636166,
2932
+ "grad_norm": 0.39721792936325073,
2933
+ "learning_rate": 1.998852198883061e-06,
2934
+ "loss": 1.5643,
2935
+ "step": 418
2936
+ },
2937
+ {
2938
+ "epoch": 0.9128540305010894,
2939
+ "grad_norm": 0.4025076627731323,
2940
+ "learning_rate": 1.903152133695385e-06,
2941
+ "loss": 1.5924,
2942
+ "step": 419
2943
+ },
2944
+ {
2945
+ "epoch": 0.9150326797385621,
2946
+ "grad_norm": 0.426258385181427,
2947
+ "learning_rate": 1.8097551161468773e-06,
2948
+ "loss": 1.5415,
2949
+ "step": 420
2950
+ },
2951
+ {
2952
+ "epoch": 0.9172113289760349,
2953
+ "grad_norm": 0.39357250928878784,
2954
+ "learning_rate": 1.7186656184179472e-06,
2955
+ "loss": 1.5431,
2956
+ "step": 421
2957
+ },
2958
+ {
2959
+ "epoch": 0.9193899782135077,
2960
+ "grad_norm": 0.4268351197242737,
2961
+ "learning_rate": 1.6298880021967665e-06,
2962
+ "loss": 1.6426,
2963
+ "step": 422
2964
+ },
2965
+ {
2966
+ "epoch": 0.9215686274509803,
2967
+ "grad_norm": 0.43658286333084106,
2968
+ "learning_rate": 1.543426518470431e-06,
2969
+ "loss": 1.5601,
2970
+ "step": 423
2971
+ },
2972
+ {
2973
+ "epoch": 0.9237472766884531,
2974
+ "grad_norm": 0.4358772337436676,
2975
+ "learning_rate": 1.4592853073214008e-06,
2976
+ "loss": 1.6446,
2977
+ "step": 424
2978
+ },
2979
+ {
2980
+ "epoch": 0.9259259259259259,
2981
+ "grad_norm": 0.4013616740703583,
2982
+ "learning_rate": 1.3774683977292424e-06,
2983
+ "loss": 1.4854,
2984
+ "step": 425
2985
+ },
2986
+ {
2987
+ "epoch": 0.9281045751633987,
2988
+ "grad_norm": 0.4512793719768524,
2989
+ "learning_rate": 1.2979797073777334e-06,
2990
+ "loss": 1.6231,
2991
+ "step": 426
2992
+ },
2993
+ {
2994
+ "epoch": 0.9302832244008714,
2995
+ "grad_norm": 0.41178834438323975,
2996
+ "learning_rate": 1.2208230424672562e-06,
2997
+ "loss": 1.4498,
2998
+ "step": 427
2999
+ },
3000
+ {
3001
+ "epoch": 0.9324618736383442,
3002
+ "grad_norm": 0.45710593461990356,
3003
+ "learning_rate": 1.1460020975325391e-06,
3004
+ "loss": 1.672,
3005
+ "step": 428
3006
+ },
3007
+ {
3008
+ "epoch": 0.934640522875817,
3009
+ "grad_norm": 0.43900266289711,
3010
+ "learning_rate": 1.0735204552657642e-06,
3011
+ "loss": 1.6774,
3012
+ "step": 429
3013
+ },
3014
+ {
3015
+ "epoch": 0.9368191721132898,
3016
+ "grad_norm": 0.46783003211021423,
3017
+ "learning_rate": 1.0033815863449981e-06,
3018
+ "loss": 1.5636,
3019
+ "step": 430
3020
+ },
3021
+ {
3022
+ "epoch": 0.9389978213507625,
3023
+ "grad_norm": 0.425657719373703,
3024
+ "learning_rate": 9.355888492680154e-07,
3025
+ "loss": 1.724,
3026
+ "step": 431
3027
+ },
3028
+ {
3029
+ "epoch": 0.9411764705882353,
3030
+ "grad_norm": 0.4375082552433014,
3031
+ "learning_rate": 8.701454901914762e-07,
3032
+ "loss": 1.5833,
3033
+ "step": 432
3034
+ },
3035
+ {
3036
+ "epoch": 0.9433551198257081,
3037
+ "grad_norm": 0.4675833582878113,
3038
+ "learning_rate": 8.0705464277549e-07,
3039
+ "loss": 1.5491,
3040
+ "step": 433
3041
+ },
3042
+ {
3043
+ "epoch": 0.9455337690631809,
3044
+ "grad_norm": 0.40916916728019714,
3045
+ "learning_rate": 7.463193280335679e-07,
3046
+ "loss": 1.5551,
3047
+ "step": 434
3048
+ },
3049
+ {
3050
+ "epoch": 0.9477124183006536,
3051
+ "grad_norm": 0.4476478099822998,
3052
+ "learning_rate": 6.879424541879676e-07,
3053
+ "loss": 1.3743,
3054
+ "step": 435
3055
+ },
3056
+ {
3057
+ "epoch": 0.9498910675381264,
3058
+ "grad_norm": 0.46898120641708374,
3059
+ "learning_rate": 6.319268165304204e-07,
3060
+ "loss": 1.6789,
3061
+ "step": 436
3062
+ },
3063
+ {
3064
+ "epoch": 0.9520697167755992,
3065
+ "grad_norm": 0.4508022665977478,
3066
+ "learning_rate": 5.782750972883111e-07,
3067
+ "loss": 1.6429,
3068
+ "step": 437
3069
+ },
3070
+ {
3071
+ "epoch": 0.954248366013072,
3072
+ "grad_norm": 0.4190502464771271,
3073
+ "learning_rate": 5.26989865496208e-07,
3074
+ "loss": 1.5525,
3075
+ "step": 438
3076
+ },
3077
+ {
3078
+ "epoch": 0.9564270152505446,
3079
+ "grad_norm": 0.4985792934894562,
3080
+ "learning_rate": 4.780735768728895e-07,
3081
+ "loss": 1.6118,
3082
+ "step": 439
3083
+ },
3084
+ {
3085
+ "epoch": 0.9586056644880174,
3086
+ "grad_norm": 0.4321351945400238,
3087
+ "learning_rate": 4.3152857370371557e-07,
3088
+ "loss": 1.6268,
3089
+ "step": 440
3090
+ },
3091
+ {
3092
+ "epoch": 0.9607843137254902,
3093
+ "grad_norm": 0.44800451397895813,
3094
+ "learning_rate": 3.873570847285013e-07,
3095
+ "loss": 1.4648,
3096
+ "step": 441
3097
+ },
3098
+ {
3099
+ "epoch": 0.9629629629629629,
3100
+ "grad_norm": 0.44108688831329346,
3101
+ "learning_rate": 3.455612250347851e-07,
3102
+ "loss": 1.5555,
3103
+ "step": 442
3104
+ },
3105
+ {
3106
+ "epoch": 0.9651416122004357,
3107
+ "grad_norm": 0.4282665550708771,
3108
+ "learning_rate": 3.061429959565487e-07,
3109
+ "loss": 1.4951,
3110
+ "step": 443
3111
+ },
3112
+ {
3113
+ "epoch": 0.9673202614379085,
3114
+ "grad_norm": 0.4284363389015198,
3115
+ "learning_rate": 2.691042849783776e-07,
3116
+ "loss": 1.5738,
3117
+ "step": 444
3118
+ },
3119
+ {
3120
+ "epoch": 0.9694989106753813,
3121
+ "grad_norm": 0.504803478717804,
3122
+ "learning_rate": 2.3444686564511043e-07,
3123
+ "loss": 1.5243,
3124
+ "step": 445
3125
+ },
3126
+ {
3127
+ "epoch": 0.971677559912854,
3128
+ "grad_norm": 0.4652966558933258,
3129
+ "learning_rate": 2.0217239747689078e-07,
3130
+ "loss": 1.5924,
3131
+ "step": 446
3132
+ },
3133
+ {
3134
+ "epoch": 0.9738562091503268,
3135
+ "grad_norm": 0.47274574637413025,
3136
+ "learning_rate": 1.722824258896971e-07,
3137
+ "loss": 1.4842,
3138
+ "step": 447
3139
+ },
3140
+ {
3141
+ "epoch": 0.9760348583877996,
3142
+ "grad_norm": 0.4845008850097656,
3143
+ "learning_rate": 1.447783821213744e-07,
3144
+ "loss": 1.5003,
3145
+ "step": 448
3146
+ },
3147
+ {
3148
+ "epoch": 0.9782135076252724,
3149
+ "grad_norm": 0.4765538275241852,
3150
+ "learning_rate": 1.1966158316307208e-07,
3151
+ "loss": 1.6308,
3152
+ "step": 449
3153
+ },
3154
+ {
3155
+ "epoch": 0.9803921568627451,
3156
+ "grad_norm": 0.5732904672622681,
3157
+ "learning_rate": 9.693323169619462e-08,
3158
+ "loss": 1.6595,
3159
+ "step": 450
3160
+ },
3161
+ {
3162
+ "epoch": 0.9825708061002179,
3163
+ "grad_norm": 0.4185521602630615,
3164
+ "learning_rate": 7.659441603481421e-08,
3165
+ "loss": 1.5248,
3166
+ "step": 451
3167
+ },
3168
+ {
3169
+ "epoch": 0.9847494553376906,
3170
+ "grad_norm": 0.43158915638923645,
3171
+ "learning_rate": 5.864611007354581e-08,
3172
+ "loss": 1.5917,
3173
+ "step": 452
3174
+ },
3175
+ {
3176
+ "epoch": 0.9869281045751634,
3177
+ "grad_norm": 0.4398334324359894,
3178
+ "learning_rate": 4.308917324092887e-08,
3179
+ "loss": 1.7976,
3180
+ "step": 453
3181
+ },
3182
+ {
3183
+ "epoch": 0.9891067538126361,
3184
+ "grad_norm": 0.44609612226486206,
3185
+ "learning_rate": 2.9924350458271357e-08,
3186
+ "loss": 1.5595,
3187
+ "step": 454
3188
+ },
3189
+ {
3190
+ "epoch": 0.9912854030501089,
3191
+ "grad_norm": 0.4355579912662506,
3192
+ "learning_rate": 1.9152272103972747e-08,
3193
+ "loss": 1.6064,
3194
+ "step": 455
3195
+ },
3196
+ {
3197
+ "epoch": 0.9934640522875817,
3198
+ "grad_norm": 0.4061749279499054,
3199
+ "learning_rate": 1.0773453983342618e-08,
3200
+ "loss": 1.658,
3201
+ "step": 456
3202
+ },
3203
+ {
3204
+ "epoch": 0.9956427015250545,
3205
+ "grad_norm": 0.41987109184265137,
3206
+ "learning_rate": 4.7882973039037326e-09,
3207
+ "loss": 1.5099,
3208
+ "step": 457
3209
+ },
3210
+ {
3211
+ "epoch": 0.9978213507625272,
3212
+ "grad_norm": 0.4400121569633484,
3213
+ "learning_rate": 1.1970886561907258e-09,
3214
+ "loss": 1.5026,
3215
+ "step": 458
3216
+ },
3217
+ {
3218
+ "epoch": 1.0,
3219
+ "grad_norm": 0.47257599234580994,
3220
+ "learning_rate": 0.0,
3221
+ "loss": 1.6942,
3222
+ "step": 459
3223
  }
3224
  ],
3225
  "logging_steps": 1,
 
3234
  "should_evaluate": false,
3235
  "should_log": false,
3236
  "should_save": true,
3237
+ "should_training_stop": true
3238
  },
3239
  "attributes": {}
3240
  }
3241
  },
3242
+ "total_flos": 4931631526182912.0,
3243
  "train_batch_size": 4,
3244
  "trial_name": null,
3245
  "trial_params": null