CocoRoF commited on
Commit
94b6891
·
verified ·
1 Parent(s): 276b80f

Training in progress, step 3500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0336b9d4a5405b35eb41810e914f8235995602c3b470eb98cb5172e5614a1617
3
  size 737580392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c03dfe3ff98720b641d5b3253f189443475f90c5848bfce1ee42b4e25e9a06d9
3
  size 737580392
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bb14199c8ed85d5530890aaca81a88b88623101addc71c4dba17e1262410aecb
3
  size 1475248442
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:781ca001e4eef0894d5dc0a043ec1d7414e5f687b44a3bb27578a66df794e142
3
  size 1475248442
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9819055317e0aa1215ad120239bc4cecc175225c0dc18c98ca0bffe9f465133f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cd02421b7ec256714ec03c37d51589e92544068eeda4bae107d407e8dfd0cb9
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7947fe218b4344129921368e2448c6474704c87d577f328a448eabc5c93d4cc3
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8785b8509dc9a197581e45af973f623b343ec6de3eb0eeab89b29a64ed0e10d5
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.4058106841611997,
5
  "eval_steps": 100,
6
- "global_step": 3000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2587,6 +2587,436 @@
2587
  "eval_spearman_manhattan": 0.8242741076199507,
2588
  "eval_steps_per_second": 15.409,
2589
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2590
  }
2591
  ],
2592
  "logging_steps": 10,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.640112464854733,
5
  "eval_steps": 100,
6
+ "global_step": 3500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2587
  "eval_spearman_manhattan": 0.8242741076199507,
2588
  "eval_steps_per_second": 15.409,
2589
  "step": 3000
2590
+ },
2591
+ {
2592
+ "epoch": 1.4104967197750704,
2593
+ "grad_norm": 1.681735873222351,
2594
+ "learning_rate": 4.559219775070291e-05,
2595
+ "loss": 0.2028,
2596
+ "step": 3010
2597
+ },
2598
+ {
2599
+ "epoch": 1.415182755388941,
2600
+ "grad_norm": 1.349747896194458,
2601
+ "learning_rate": 4.5577553889409565e-05,
2602
+ "loss": 0.1849,
2603
+ "step": 3020
2604
+ },
2605
+ {
2606
+ "epoch": 1.4198687910028116,
2607
+ "grad_norm": 1.752896785736084,
2608
+ "learning_rate": 4.556291002811622e-05,
2609
+ "loss": 0.1939,
2610
+ "step": 3030
2611
+ },
2612
+ {
2613
+ "epoch": 1.4245548266166823,
2614
+ "grad_norm": 1.538013219833374,
2615
+ "learning_rate": 4.5548266166822874e-05,
2616
+ "loss": 0.1711,
2617
+ "step": 3040
2618
+ },
2619
+ {
2620
+ "epoch": 1.429240862230553,
2621
+ "grad_norm": 1.6929601430892944,
2622
+ "learning_rate": 4.553362230552952e-05,
2623
+ "loss": 0.1835,
2624
+ "step": 3050
2625
+ },
2626
+ {
2627
+ "epoch": 1.4339268978444237,
2628
+ "grad_norm": 1.9702001810073853,
2629
+ "learning_rate": 4.5518978444236176e-05,
2630
+ "loss": 0.1968,
2631
+ "step": 3060
2632
+ },
2633
+ {
2634
+ "epoch": 1.4386129334582942,
2635
+ "grad_norm": 1.5056127309799194,
2636
+ "learning_rate": 4.550433458294283e-05,
2637
+ "loss": 0.1856,
2638
+ "step": 3070
2639
+ },
2640
+ {
2641
+ "epoch": 1.443298969072165,
2642
+ "grad_norm": 1.8639978170394897,
2643
+ "learning_rate": 4.5489690721649484e-05,
2644
+ "loss": 0.1854,
2645
+ "step": 3080
2646
+ },
2647
+ {
2648
+ "epoch": 1.4479850046860356,
2649
+ "grad_norm": 1.6516441106796265,
2650
+ "learning_rate": 4.547504686035614e-05,
2651
+ "loss": 0.1816,
2652
+ "step": 3090
2653
+ },
2654
+ {
2655
+ "epoch": 1.4526710402999063,
2656
+ "grad_norm": 1.3151347637176514,
2657
+ "learning_rate": 4.546040299906279e-05,
2658
+ "loss": 0.1777,
2659
+ "step": 3100
2660
+ },
2661
+ {
2662
+ "epoch": 1.4526710402999063,
2663
+ "eval_loss": 0.036029454320669174,
2664
+ "eval_pearson_cosine": 0.8219421674547647,
2665
+ "eval_pearson_dot": 0.7544857842745216,
2666
+ "eval_pearson_euclidean": 0.8154423230591377,
2667
+ "eval_pearson_manhattan": 0.8169259606030721,
2668
+ "eval_runtime": 5.9655,
2669
+ "eval_samples_per_second": 251.446,
2670
+ "eval_spearman_cosine": 0.8227245969216198,
2671
+ "eval_spearman_dot": 0.7556651515459966,
2672
+ "eval_spearman_euclidean": 0.8221360306356487,
2673
+ "eval_spearman_manhattan": 0.8232106684973721,
2674
+ "eval_steps_per_second": 15.757,
2675
+ "step": 3100
2676
+ },
2677
+ {
2678
+ "epoch": 1.457357075913777,
2679
+ "grad_norm": 1.5906847715377808,
2680
+ "learning_rate": 4.544575913776945e-05,
2681
+ "loss": 0.2292,
2682
+ "step": 3110
2683
+ },
2684
+ {
2685
+ "epoch": 1.4620431115276475,
2686
+ "grad_norm": 1.2340494394302368,
2687
+ "learning_rate": 4.543111527647611e-05,
2688
+ "loss": 0.1885,
2689
+ "step": 3120
2690
+ },
2691
+ {
2692
+ "epoch": 1.4667291471415183,
2693
+ "grad_norm": 1.7530288696289062,
2694
+ "learning_rate": 4.541647141518276e-05,
2695
+ "loss": 0.2088,
2696
+ "step": 3130
2697
+ },
2698
+ {
2699
+ "epoch": 1.471415182755389,
2700
+ "grad_norm": 1.7493581771850586,
2701
+ "learning_rate": 4.540182755388942e-05,
2702
+ "loss": 0.1704,
2703
+ "step": 3140
2704
+ },
2705
+ {
2706
+ "epoch": 1.4761012183692597,
2707
+ "grad_norm": 1.5426673889160156,
2708
+ "learning_rate": 4.5387183692596064e-05,
2709
+ "loss": 0.1743,
2710
+ "step": 3150
2711
+ },
2712
+ {
2713
+ "epoch": 1.4807872539831304,
2714
+ "grad_norm": 1.4274178743362427,
2715
+ "learning_rate": 4.537253983130272e-05,
2716
+ "loss": 0.1929,
2717
+ "step": 3160
2718
+ },
2719
+ {
2720
+ "epoch": 1.4854732895970009,
2721
+ "grad_norm": 1.372902512550354,
2722
+ "learning_rate": 4.535789597000937e-05,
2723
+ "loss": 0.1913,
2724
+ "step": 3170
2725
+ },
2726
+ {
2727
+ "epoch": 1.4901593252108716,
2728
+ "grad_norm": 2.073024272918701,
2729
+ "learning_rate": 4.534325210871603e-05,
2730
+ "loss": 0.2104,
2731
+ "step": 3180
2732
+ },
2733
+ {
2734
+ "epoch": 1.4948453608247423,
2735
+ "grad_norm": 1.5448635816574097,
2736
+ "learning_rate": 4.532860824742268e-05,
2737
+ "loss": 0.2018,
2738
+ "step": 3190
2739
+ },
2740
+ {
2741
+ "epoch": 1.499531396438613,
2742
+ "grad_norm": 1.4171442985534668,
2743
+ "learning_rate": 4.5313964386129336e-05,
2744
+ "loss": 0.1816,
2745
+ "step": 3200
2746
+ },
2747
+ {
2748
+ "epoch": 1.499531396438613,
2749
+ "eval_loss": 0.036359407007694244,
2750
+ "eval_pearson_cosine": 0.821300031874455,
2751
+ "eval_pearson_dot": 0.7616346380196859,
2752
+ "eval_pearson_euclidean": 0.8169013356741246,
2753
+ "eval_pearson_manhattan": 0.8184860654262422,
2754
+ "eval_runtime": 5.9202,
2755
+ "eval_samples_per_second": 253.369,
2756
+ "eval_spearman_cosine": 0.8228088882125945,
2757
+ "eval_spearman_dot": 0.7590379988222679,
2758
+ "eval_spearman_euclidean": 0.8237353804832064,
2759
+ "eval_spearman_manhattan": 0.82472345338271,
2760
+ "eval_steps_per_second": 15.878,
2761
+ "step": 3200
2762
+ },
2763
+ {
2764
+ "epoch": 1.5042174320524837,
2765
+ "grad_norm": 1.7904757261276245,
2766
+ "learning_rate": 4.529932052483599e-05,
2767
+ "loss": 0.1955,
2768
+ "step": 3210
2769
+ },
2770
+ {
2771
+ "epoch": 1.5089034676663542,
2772
+ "grad_norm": 2.021733522415161,
2773
+ "learning_rate": 4.5284676663542644e-05,
2774
+ "loss": 0.1776,
2775
+ "step": 3220
2776
+ },
2777
+ {
2778
+ "epoch": 1.513589503280225,
2779
+ "grad_norm": 1.2433106899261475,
2780
+ "learning_rate": 4.52700328022493e-05,
2781
+ "loss": 0.1626,
2782
+ "step": 3230
2783
+ },
2784
+ {
2785
+ "epoch": 1.5182755388940956,
2786
+ "grad_norm": 1.4370200634002686,
2787
+ "learning_rate": 4.525538894095596e-05,
2788
+ "loss": 0.1752,
2789
+ "step": 3240
2790
+ },
2791
+ {
2792
+ "epoch": 1.522961574507966,
2793
+ "grad_norm": 1.9471467733383179,
2794
+ "learning_rate": 4.524074507966261e-05,
2795
+ "loss": 0.1782,
2796
+ "step": 3250
2797
+ },
2798
+ {
2799
+ "epoch": 1.527647610121837,
2800
+ "grad_norm": 1.829440712928772,
2801
+ "learning_rate": 4.522610121836926e-05,
2802
+ "loss": 0.2014,
2803
+ "step": 3260
2804
+ },
2805
+ {
2806
+ "epoch": 1.5323336457357075,
2807
+ "grad_norm": 1.703355073928833,
2808
+ "learning_rate": 4.5211457357075915e-05,
2809
+ "loss": 0.1762,
2810
+ "step": 3270
2811
+ },
2812
+ {
2813
+ "epoch": 1.5370196813495782,
2814
+ "grad_norm": 1.6706669330596924,
2815
+ "learning_rate": 4.519681349578257e-05,
2816
+ "loss": 0.1937,
2817
+ "step": 3280
2818
+ },
2819
+ {
2820
+ "epoch": 1.541705716963449,
2821
+ "grad_norm": 1.5066584348678589,
2822
+ "learning_rate": 4.5182169634489224e-05,
2823
+ "loss": 0.2008,
2824
+ "step": 3290
2825
+ },
2826
+ {
2827
+ "epoch": 1.5463917525773194,
2828
+ "grad_norm": 1.4767428636550903,
2829
+ "learning_rate": 4.516752577319588e-05,
2830
+ "loss": 0.229,
2831
+ "step": 3300
2832
+ },
2833
+ {
2834
+ "epoch": 1.5463917525773194,
2835
+ "eval_loss": 0.039627715945243835,
2836
+ "eval_pearson_cosine": 0.8169224788587428,
2837
+ "eval_pearson_dot": 0.7529223671879777,
2838
+ "eval_pearson_euclidean": 0.816545384611743,
2839
+ "eval_pearson_manhattan": 0.8176740235496034,
2840
+ "eval_runtime": 6.324,
2841
+ "eval_samples_per_second": 237.191,
2842
+ "eval_spearman_cosine": 0.8199004477023643,
2843
+ "eval_spearman_dot": 0.7498417362110426,
2844
+ "eval_spearman_euclidean": 0.8235117073588528,
2845
+ "eval_spearman_manhattan": 0.8240841209374519,
2846
+ "eval_steps_per_second": 14.864,
2847
+ "step": 3300
2848
+ },
2849
+ {
2850
+ "epoch": 1.5510777881911904,
2851
+ "grad_norm": 1.3127154111862183,
2852
+ "learning_rate": 4.515288191190253e-05,
2853
+ "loss": 0.1849,
2854
+ "step": 3310
2855
+ },
2856
+ {
2857
+ "epoch": 1.5557638238050608,
2858
+ "grad_norm": 0.9424723982810974,
2859
+ "learning_rate": 4.513823805060919e-05,
2860
+ "loss": 0.2011,
2861
+ "step": 3320
2862
+ },
2863
+ {
2864
+ "epoch": 1.5604498594189316,
2865
+ "grad_norm": 1.585274577140808,
2866
+ "learning_rate": 4.512359418931584e-05,
2867
+ "loss": 0.1935,
2868
+ "step": 3330
2869
+ },
2870
+ {
2871
+ "epoch": 1.5651358950328023,
2872
+ "grad_norm": 1.4503992795944214,
2873
+ "learning_rate": 4.5108950328022495e-05,
2874
+ "loss": 0.2189,
2875
+ "step": 3340
2876
+ },
2877
+ {
2878
+ "epoch": 1.569821930646673,
2879
+ "grad_norm": 1.6958515644073486,
2880
+ "learning_rate": 4.509430646672915e-05,
2881
+ "loss": 0.1949,
2882
+ "step": 3350
2883
+ },
2884
+ {
2885
+ "epoch": 1.5745079662605437,
2886
+ "grad_norm": 1.7165809869766235,
2887
+ "learning_rate": 4.5079662605435804e-05,
2888
+ "loss": 0.1897,
2889
+ "step": 3360
2890
+ },
2891
+ {
2892
+ "epoch": 1.5791940018744142,
2893
+ "grad_norm": 1.4036378860473633,
2894
+ "learning_rate": 4.506501874414246e-05,
2895
+ "loss": 0.2173,
2896
+ "step": 3370
2897
+ },
2898
+ {
2899
+ "epoch": 1.5838800374882849,
2900
+ "grad_norm": 1.4913054704666138,
2901
+ "learning_rate": 4.505037488284911e-05,
2902
+ "loss": 0.1931,
2903
+ "step": 3380
2904
+ },
2905
+ {
2906
+ "epoch": 1.5885660731021556,
2907
+ "grad_norm": 1.7645376920700073,
2908
+ "learning_rate": 4.503573102155577e-05,
2909
+ "loss": 0.201,
2910
+ "step": 3390
2911
+ },
2912
+ {
2913
+ "epoch": 1.5932521087160263,
2914
+ "grad_norm": 1.2109887599945068,
2915
+ "learning_rate": 4.502108716026242e-05,
2916
+ "loss": 0.1742,
2917
+ "step": 3400
2918
+ },
2919
+ {
2920
+ "epoch": 1.5932521087160263,
2921
+ "eval_loss": 0.034527041018009186,
2922
+ "eval_pearson_cosine": 0.8244957125194645,
2923
+ "eval_pearson_dot": 0.7646698456086369,
2924
+ "eval_pearson_euclidean": 0.8169295762368449,
2925
+ "eval_pearson_manhattan": 0.8185445852885636,
2926
+ "eval_runtime": 5.9441,
2927
+ "eval_samples_per_second": 252.35,
2928
+ "eval_spearman_cosine": 0.8251998390698203,
2929
+ "eval_spearman_dot": 0.7633691785479392,
2930
+ "eval_spearman_euclidean": 0.8242535656619165,
2931
+ "eval_spearman_manhattan": 0.825256335833904,
2932
+ "eval_steps_per_second": 15.814,
2933
+ "step": 3400
2934
+ },
2935
+ {
2936
+ "epoch": 1.597938144329897,
2937
+ "grad_norm": 0.9772526621818542,
2938
+ "learning_rate": 4.5006443298969075e-05,
2939
+ "loss": 0.1646,
2940
+ "step": 3410
2941
+ },
2942
+ {
2943
+ "epoch": 1.6026241799437675,
2944
+ "grad_norm": 1.2620090246200562,
2945
+ "learning_rate": 4.499179943767573e-05,
2946
+ "loss": 0.1849,
2947
+ "step": 3420
2948
+ },
2949
+ {
2950
+ "epoch": 1.6073102155576382,
2951
+ "grad_norm": 1.5649354457855225,
2952
+ "learning_rate": 4.4977155576382384e-05,
2953
+ "loss": 0.2149,
2954
+ "step": 3430
2955
+ },
2956
+ {
2957
+ "epoch": 1.611996251171509,
2958
+ "grad_norm": 1.3986328840255737,
2959
+ "learning_rate": 4.496251171508904e-05,
2960
+ "loss": 0.1706,
2961
+ "step": 3440
2962
+ },
2963
+ {
2964
+ "epoch": 1.6166822867853796,
2965
+ "grad_norm": 1.0502641201019287,
2966
+ "learning_rate": 4.4947867853795686e-05,
2967
+ "loss": 0.18,
2968
+ "step": 3450
2969
+ },
2970
+ {
2971
+ "epoch": 1.6213683223992503,
2972
+ "grad_norm": 2.1642649173736572,
2973
+ "learning_rate": 4.4933223992502347e-05,
2974
+ "loss": 0.1903,
2975
+ "step": 3460
2976
+ },
2977
+ {
2978
+ "epoch": 1.6260543580131208,
2979
+ "grad_norm": 2.1181936264038086,
2980
+ "learning_rate": 4.4918580131209e-05,
2981
+ "loss": 0.2061,
2982
+ "step": 3470
2983
+ },
2984
+ {
2985
+ "epoch": 1.6307403936269915,
2986
+ "grad_norm": 1.52034592628479,
2987
+ "learning_rate": 4.4903936269915655e-05,
2988
+ "loss": 0.1612,
2989
+ "step": 3480
2990
+ },
2991
+ {
2992
+ "epoch": 1.6354264292408622,
2993
+ "grad_norm": 1.6476225852966309,
2994
+ "learning_rate": 4.488929240862231e-05,
2995
+ "loss": 0.1825,
2996
+ "step": 3490
2997
+ },
2998
+ {
2999
+ "epoch": 1.640112464854733,
3000
+ "grad_norm": 1.46713387966156,
3001
+ "learning_rate": 4.4874648547328964e-05,
3002
+ "loss": 0.1606,
3003
+ "step": 3500
3004
+ },
3005
+ {
3006
+ "epoch": 1.640112464854733,
3007
+ "eval_loss": 0.03454529866576195,
3008
+ "eval_pearson_cosine": 0.8219067533222528,
3009
+ "eval_pearson_dot": 0.7628975832542579,
3010
+ "eval_pearson_euclidean": 0.8128236856339015,
3011
+ "eval_pearson_manhattan": 0.8145631898507872,
3012
+ "eval_runtime": 5.977,
3013
+ "eval_samples_per_second": 250.962,
3014
+ "eval_spearman_cosine": 0.8230006137618159,
3015
+ "eval_spearman_dot": 0.7622148554080955,
3016
+ "eval_spearman_euclidean": 0.8212688136914371,
3017
+ "eval_spearman_manhattan": 0.8222685344671697,
3018
+ "eval_steps_per_second": 15.727,
3019
+ "step": 3500
3020
  }
3021
  ],
3022
  "logging_steps": 10,