NBAmine committed on
Commit
87d8e0b
·
verified ·
1 Parent(s): c256cf5

Training in progress, step 2700, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4ddfdb9e3869897cc8e2c794340a2005ba76c5f50e34e53325b8ac99f6dc318
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df0b3c057589426de11702e8aa51f40578fbdc1c16b5298b4df1b3741a358543
3
  size 228140600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e830b2069082bb840c6d5f287e7770c0cf0e2e3f80198ae73bbef00b14811db
3
  size 117931203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e1a2a35f3f40624f11f416233f78a070b1dea29da95a3a90a9a787a9173de3d
3
  size 117931203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e39d866cd1fc861fe2c47687364cde08217b0454e6f5ff3c9a3af4b1571fdbed
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54ee403e6e7f52e165fb91ab2843ca4f38ca3d3c64d81b59c5a39f9e4c098413
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:137d8a11890df77c4e1b6a4687bee089955dbcdddb421d49b265e762ccebb1d2
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88be0f049d620e88b111c309644f5ca8c552ca0e64dbf5a41f67ac4dd14016eb
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a3c006c2c7c0bc33914c8e11069f53d495f2eafa42ba0a076cb7cebbe066c7a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6abcf0c15a7ba90c608cb1903d96b4ad18eb9806fb694a46be4e23a52b64410b
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 750,
3
  "best_metric": 0.5089643597602844,
4
  "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
5
- "epoch": 3.84,
6
  "eval_steps": 300,
7
- "global_step": 2400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2564,6 +2564,318 @@
2564
  "eval_samples_per_second": 2.301,
2565
  "eval_steps_per_second": 0.575,
2566
  "step": 2400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2567
  }
2568
  ],
2569
  "logging_steps": 10,
@@ -2583,7 +2895,7 @@
2583
  "attributes": {}
2584
  }
2585
  },
2586
- "total_flos": 4.143800723056128e+17,
2587
  "train_batch_size": 1,
2588
  "trial_name": null,
2589
  "trial_params": null
 
2
  "best_global_step": 750,
3
  "best_metric": 0.5089643597602844,
4
  "best_model_checkpoint": "./adapter-phase1/checkpoint-750",
5
+ "epoch": 4.32,
6
  "eval_steps": 300,
7
+ "global_step": 2700,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2564
  "eval_samples_per_second": 2.301,
2565
  "eval_steps_per_second": 0.575,
2566
  "step": 2400
2567
+ },
2568
+ {
2569
+ "entropy": 0.2655953477136791,
2570
+ "epoch": 3.856,
2571
+ "grad_norm": 0.8277497291564941,
2572
+ "learning_rate": 2.2976e-05,
2573
+ "loss": 0.2109,
2574
+ "mean_token_accuracy": 0.9393812574446201,
2575
+ "num_tokens": 1011268.0,
2576
+ "step": 2410
2577
+ },
2578
+ {
2579
+ "entropy": 0.2920296056661755,
2580
+ "epoch": 3.872,
2581
+ "grad_norm": 1.015434980392456,
2582
+ "learning_rate": 2.2656e-05,
2583
+ "loss": 0.2243,
2584
+ "mean_token_accuracy": 0.9357186656445264,
2585
+ "num_tokens": 1026942.0,
2586
+ "step": 2420
2587
+ },
2588
+ {
2589
+ "entropy": 0.2859017666429281,
2590
+ "epoch": 3.888,
2591
+ "grad_norm": 0.6656726002693176,
2592
+ "learning_rate": 2.2336e-05,
2593
+ "loss": 0.2389,
2594
+ "mean_token_accuracy": 0.9283736657351256,
2595
+ "num_tokens": 1053937.0,
2596
+ "step": 2430
2597
+ },
2598
+ {
2599
+ "entropy": 0.24961302392184734,
2600
+ "epoch": 3.904,
2601
+ "grad_norm": 0.8390278816223145,
2602
+ "learning_rate": 2.2016e-05,
2603
+ "loss": 0.2211,
2604
+ "mean_token_accuracy": 0.9312011521309614,
2605
+ "num_tokens": 1084820.0,
2606
+ "step": 2440
2607
+ },
2608
+ {
2609
+ "entropy": 0.2519187033176422,
2610
+ "epoch": 3.92,
2611
+ "grad_norm": 0.8542287349700928,
2612
+ "learning_rate": 2.1696e-05,
2613
+ "loss": 0.2126,
2614
+ "mean_token_accuracy": 0.9375488836318254,
2615
+ "num_tokens": 1109943.0,
2616
+ "step": 2450
2617
+ },
2618
+ {
2619
+ "entropy": 0.27277124775573613,
2620
+ "epoch": 3.936,
2621
+ "grad_norm": 0.9245595335960388,
2622
+ "learning_rate": 2.1376e-05,
2623
+ "loss": 0.2161,
2624
+ "mean_token_accuracy": 0.9364014331251382,
2625
+ "num_tokens": 1130543.0,
2626
+ "step": 2460
2627
+ },
2628
+ {
2629
+ "entropy": 0.28273853762075307,
2630
+ "epoch": 3.952,
2631
+ "grad_norm": 0.9764724969863892,
2632
+ "learning_rate": 2.1056e-05,
2633
+ "loss": 0.2217,
2634
+ "mean_token_accuracy": 0.9356040749698877,
2635
+ "num_tokens": 1146676.0,
2636
+ "step": 2470
2637
+ },
2638
+ {
2639
+ "entropy": 0.2879827093333006,
2640
+ "epoch": 3.968,
2641
+ "grad_norm": 0.7532303929328918,
2642
+ "learning_rate": 2.0736e-05,
2643
+ "loss": 0.2413,
2644
+ "mean_token_accuracy": 0.9290374431759119,
2645
+ "num_tokens": 1172078.0,
2646
+ "step": 2480
2647
+ },
2648
+ {
2649
+ "entropy": 0.2530561724677682,
2650
+ "epoch": 3.984,
2651
+ "grad_norm": 0.8568546175956726,
2652
+ "learning_rate": 2.0416000000000002e-05,
2653
+ "loss": 0.2177,
2654
+ "mean_token_accuracy": 0.9337470591068268,
2655
+ "num_tokens": 1197464.0,
2656
+ "step": 2490
2657
+ },
2658
+ {
2659
+ "entropy": 0.3038310568779707,
2660
+ "epoch": 4.0,
2661
+ "grad_norm": 0.9622617959976196,
2662
+ "learning_rate": 2.0096000000000002e-05,
2663
+ "loss": 0.2368,
2664
+ "mean_token_accuracy": 0.9296225290745497,
2665
+ "num_tokens": 1212204.0,
2666
+ "step": 2500
2667
+ },
2668
+ {
2669
+ "entropy": 0.24809251818805933,
2670
+ "epoch": 4.016,
2671
+ "grad_norm": 0.8197008371353149,
2672
+ "learning_rate": 1.9776000000000002e-05,
2673
+ "loss": 0.2395,
2674
+ "mean_token_accuracy": 0.928604032099247,
2675
+ "num_tokens": 1253458.0,
2676
+ "step": 2510
2677
+ },
2678
+ {
2679
+ "entropy": 0.24905966678634286,
2680
+ "epoch": 4.032,
2681
+ "grad_norm": 0.8056384921073914,
2682
+ "learning_rate": 1.9456e-05,
2683
+ "loss": 0.2301,
2684
+ "mean_token_accuracy": 0.9330911111086607,
2685
+ "num_tokens": 1282365.0,
2686
+ "step": 2520
2687
+ },
2688
+ {
2689
+ "entropy": 0.26601817598566413,
2690
+ "epoch": 4.048,
2691
+ "grad_norm": 0.9766417145729065,
2692
+ "learning_rate": 1.9136e-05,
2693
+ "loss": 0.2237,
2694
+ "mean_token_accuracy": 0.9384452097117901,
2695
+ "num_tokens": 1305420.0,
2696
+ "step": 2530
2697
+ },
2698
+ {
2699
+ "entropy": 0.28673125999048354,
2700
+ "epoch": 4.064,
2701
+ "grad_norm": 1.2241604328155518,
2702
+ "learning_rate": 1.8816e-05,
2703
+ "loss": 0.2615,
2704
+ "mean_token_accuracy": 0.9268214203417301,
2705
+ "num_tokens": 1323367.0,
2706
+ "step": 2540
2707
+ },
2708
+ {
2709
+ "entropy": 0.3297149523161352,
2710
+ "epoch": 4.08,
2711
+ "grad_norm": 1.2444630861282349,
2712
+ "learning_rate": 1.8496000000000004e-05,
2713
+ "loss": 0.266,
2714
+ "mean_token_accuracy": 0.9285014558583498,
2715
+ "num_tokens": 1335370.0,
2716
+ "step": 2550
2717
+ },
2718
+ {
2719
+ "entropy": 0.25180468857288363,
2720
+ "epoch": 4.096,
2721
+ "grad_norm": 0.6901214718818665,
2722
+ "learning_rate": 1.8176e-05,
2723
+ "loss": 0.2242,
2724
+ "mean_token_accuracy": 0.9317782554775477,
2725
+ "num_tokens": 1374567.0,
2726
+ "step": 2560
2727
+ },
2728
+ {
2729
+ "entropy": 0.25819407450035214,
2730
+ "epoch": 4.112,
2731
+ "grad_norm": 0.8702373504638672,
2732
+ "learning_rate": 1.7856e-05,
2733
+ "loss": 0.2344,
2734
+ "mean_token_accuracy": 0.9326971143484115,
2735
+ "num_tokens": 1402608.0,
2736
+ "step": 2570
2737
+ },
2738
+ {
2739
+ "entropy": 0.26549670435488226,
2740
+ "epoch": 4.128,
2741
+ "grad_norm": 0.7631207704544067,
2742
+ "learning_rate": 1.7536e-05,
2743
+ "loss": 0.2297,
2744
+ "mean_token_accuracy": 0.9365796335041523,
2745
+ "num_tokens": 1425524.0,
2746
+ "step": 2580
2747
+ },
2748
+ {
2749
+ "entropy": 0.26975566176697613,
2750
+ "epoch": 4.144,
2751
+ "grad_norm": 1.1718668937683105,
2752
+ "learning_rate": 1.7216000000000003e-05,
2753
+ "loss": 0.221,
2754
+ "mean_token_accuracy": 0.9397962510585784,
2755
+ "num_tokens": 1444092.0,
2756
+ "step": 2590
2757
+ },
2758
+ {
2759
+ "entropy": 0.3168819394893944,
2760
+ "epoch": 4.16,
2761
+ "grad_norm": 1.0534077882766724,
2762
+ "learning_rate": 1.6896000000000002e-05,
2763
+ "loss": 0.2544,
2764
+ "mean_token_accuracy": 0.9319371480494738,
2765
+ "num_tokens": 1456844.0,
2766
+ "step": 2600
2767
+ },
2768
+ {
2769
+ "entropy": 0.25265237540006635,
2770
+ "epoch": 4.176,
2771
+ "grad_norm": 0.7592364549636841,
2772
+ "learning_rate": 1.6576e-05,
2773
+ "loss": 0.2395,
2774
+ "mean_token_accuracy": 0.9289916418492794,
2775
+ "num_tokens": 1496545.0,
2776
+ "step": 2610
2777
+ },
2778
+ {
2779
+ "entropy": 0.2543726827017963,
2780
+ "epoch": 4.192,
2781
+ "grad_norm": 0.9639586210250854,
2782
+ "learning_rate": 1.6256e-05,
2783
+ "loss": 0.2351,
2784
+ "mean_token_accuracy": 0.9337568439543247,
2785
+ "num_tokens": 1525103.0,
2786
+ "step": 2620
2787
+ },
2788
+ {
2789
+ "entropy": 0.26547051025554536,
2790
+ "epoch": 4.208,
2791
+ "grad_norm": 0.9620559215545654,
2792
+ "learning_rate": 1.5936e-05,
2793
+ "loss": 0.2382,
2794
+ "mean_token_accuracy": 0.9348125293850899,
2795
+ "num_tokens": 1548306.0,
2796
+ "step": 2630
2797
+ },
2798
+ {
2799
+ "entropy": 0.27369030360132457,
2800
+ "epoch": 4.224,
2801
+ "grad_norm": 0.8373218774795532,
2802
+ "learning_rate": 1.5616e-05,
2803
+ "loss": 0.2254,
2804
+ "mean_token_accuracy": 0.9375662509351969,
2805
+ "num_tokens": 1566990.0,
2806
+ "step": 2640
2807
+ },
2808
+ {
2809
+ "entropy": 0.3024815677665174,
2810
+ "epoch": 4.24,
2811
+ "grad_norm": 1.3148176670074463,
2812
+ "learning_rate": 1.5296e-05,
2813
+ "loss": 0.2391,
2814
+ "mean_token_accuracy": 0.9351990919560194,
2815
+ "num_tokens": 1580065.0,
2816
+ "step": 2650
2817
+ },
2818
+ {
2819
+ "entropy": 0.2600595161318779,
2820
+ "epoch": 4.256,
2821
+ "grad_norm": 0.6774656176567078,
2822
+ "learning_rate": 1.4976000000000002e-05,
2823
+ "loss": 0.2377,
2824
+ "mean_token_accuracy": 0.9274554952979088,
2825
+ "num_tokens": 1619083.0,
2826
+ "step": 2660
2827
+ },
2828
+ {
2829
+ "entropy": 0.26013899641111493,
2830
+ "epoch": 4.272,
2831
+ "grad_norm": 0.9727310538291931,
2832
+ "learning_rate": 1.4656e-05,
2833
+ "loss": 0.2294,
2834
+ "mean_token_accuracy": 0.934112536534667,
2835
+ "num_tokens": 1646970.0,
2836
+ "step": 2670
2837
+ },
2838
+ {
2839
+ "entropy": 0.25867203902453184,
2840
+ "epoch": 4.288,
2841
+ "grad_norm": 0.9198706150054932,
2842
+ "learning_rate": 1.4336e-05,
2843
+ "loss": 0.2184,
2844
+ "mean_token_accuracy": 0.9373745564371347,
2845
+ "num_tokens": 1669364.0,
2846
+ "step": 2680
2847
+ },
2848
+ {
2849
+ "entropy": 0.26432402124628424,
2850
+ "epoch": 4.304,
2851
+ "grad_norm": 0.9908862709999084,
2852
+ "learning_rate": 1.4016000000000001e-05,
2853
+ "loss": 0.2195,
2854
+ "mean_token_accuracy": 0.9392576098442078,
2855
+ "num_tokens": 1687812.0,
2856
+ "step": 2690
2857
+ },
2858
+ {
2859
+ "entropy": 0.30741472546942533,
2860
+ "epoch": 4.32,
2861
+ "grad_norm": 1.0388495922088623,
2862
+ "learning_rate": 1.3696e-05,
2863
+ "loss": 0.2503,
2864
+ "mean_token_accuracy": 0.9325483400374651,
2865
+ "num_tokens": 1700598.0,
2866
+ "step": 2700
2867
+ },
2868
+ {
2869
+ "epoch": 4.32,
2870
+ "eval_accuracy": 0.02638358121882313,
2871
+ "eval_entropy": 0.3719751555919647,
2872
+ "eval_loss": 0.5846644043922424,
2873
+ "eval_mean_token_accuracy": 0.8568292667865753,
2874
+ "eval_num_tokens": 1700598.0,
2875
+ "eval_runtime": 869.8497,
2876
+ "eval_samples_per_second": 2.299,
2877
+ "eval_steps_per_second": 0.575,
2878
+ "step": 2700
2879
  }
2880
  ],
2881
  "logging_steps": 10,
 
2895
  "attributes": {}
2896
  }
2897
  },
2898
+ "total_flos": 4.639214588564275e+17,
2899
  "train_batch_size": 1,
2900
  "trial_name": null,
2901
  "trial_params": null