robertou2 commited on
Commit
a9f25ff
·
verified ·
1 Parent(s): d8bbaab

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +7 -1057
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d93ca4eb9edec96116bcbc3b1e81f3a0ba56ed79236db43613c942a41c02c063
3
  size 92309112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5529bbd8b02900353e5a9edb1b0cc3a12d5828ce3583f1e939e8a5cd7869147d
3
  size 92309112
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e429f800f9a198ef6019ccdc8fa92c738c6868bcea07ecdf471826e9764be8d6
3
  size 184765003
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abc81f39457613379a6b53d39a8e9a20485a39bc6441c72daab7e852c4611bd3
3
  size 184765003
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9f086fad67247c070127f3d5f3c07ca470a78f03c26cde9ec189b8223e059e7
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96d2f22d26bc65f3aeedce5509461616d5bf62bde9362cbb9270a9fe00a8d63a
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5a7217cbf6a48b6118e1945e7a873ce9e9505368a7aa0f316b1becdcb2a8301
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1217dd157c01a1c43f8d1f2eafc858dc7730cb63e7c08068881fa71d637b5c4a
3
  size 1465
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 247,
3
- "best_metric": 0.0044091795571148396,
4
- "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-240",
5
- "epoch": 13.157894736842104,
6
  "eval_steps": 1,
7
- "global_step": 250,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2708,1056 +2708,6 @@
2708
  "eval_samples_per_second": 8.823,
2709
  "eval_steps_per_second": 1.176,
2710
  "step": 180
2711
- },
2712
- {
2713
- "epoch": 9.526315789473685,
2714
- "grad_norm": 0.5609657764434814,
2715
- "learning_rate": 0.0001017050638176612,
2716
- "loss": 0.0328,
2717
- "step": 181
2718
- },
2719
- {
2720
- "epoch": 9.526315789473685,
2721
- "eval_loss": 0.03608579561114311,
2722
- "eval_runtime": 3.3972,
2723
- "eval_samples_per_second": 8.831,
2724
- "eval_steps_per_second": 1.177,
2725
- "step": 181
2726
- },
2727
- {
2728
- "epoch": 9.578947368421053,
2729
- "grad_norm": 0.48078685998916626,
2730
- "learning_rate": 9.902775451932386e-05,
2731
- "loss": 0.0216,
2732
- "step": 182
2733
- },
2734
- {
2735
- "epoch": 9.578947368421053,
2736
- "eval_loss": 0.0358748622238636,
2737
- "eval_runtime": 3.3946,
2738
- "eval_samples_per_second": 8.837,
2739
- "eval_steps_per_second": 1.178,
2740
- "step": 182
2741
- },
2742
- {
2743
- "epoch": 9.631578947368421,
2744
- "grad_norm": 0.5188214182853699,
2745
- "learning_rate": 9.637742601134286e-05,
2746
- "loss": 0.0438,
2747
- "step": 183
2748
- },
2749
- {
2750
- "epoch": 9.631578947368421,
2751
- "eval_loss": 0.03486837074160576,
2752
- "eval_runtime": 3.3974,
2753
- "eval_samples_per_second": 8.83,
2754
- "eval_steps_per_second": 1.177,
2755
- "step": 183
2756
- },
2757
- {
2758
- "epoch": 9.68421052631579,
2759
- "grad_norm": 0.7200556993484497,
2760
- "learning_rate": 9.375455194341214e-05,
2761
- "loss": 0.0663,
2762
- "step": 184
2763
- },
2764
- {
2765
- "epoch": 9.68421052631579,
2766
- "eval_loss": 0.03245267644524574,
2767
- "eval_runtime": 3.4008,
2768
- "eval_samples_per_second": 8.822,
2769
- "eval_steps_per_second": 1.176,
2770
- "step": 184
2771
- },
2772
- {
2773
- "epoch": 9.736842105263158,
2774
- "grad_norm": 0.6560045480728149,
2775
- "learning_rate": 9.11596010587441e-05,
2776
- "loss": 0.064,
2777
- "step": 185
2778
- },
2779
- {
2780
- "epoch": 9.736842105263158,
2781
- "eval_loss": 0.029578620567917824,
2782
- "eval_runtime": 3.4019,
2783
- "eval_samples_per_second": 8.819,
2784
- "eval_steps_per_second": 1.176,
2785
- "step": 185
2786
- },
2787
- {
2788
- "epoch": 9.789473684210526,
2789
- "grad_norm": 0.5027221441268921,
2790
- "learning_rate": 8.85930371102994e-05,
2791
- "loss": 0.0416,
2792
- "step": 186
2793
- },
2794
- {
2795
- "epoch": 9.789473684210526,
2796
- "eval_loss": 0.026809442788362503,
2797
- "eval_runtime": 3.4005,
2798
- "eval_samples_per_second": 8.822,
2799
- "eval_steps_per_second": 1.176,
2800
- "step": 186
2801
- },
2802
- {
2803
- "epoch": 9.842105263157894,
2804
- "grad_norm": 0.4124845564365387,
2805
- "learning_rate": 8.605531877790762e-05,
2806
- "loss": 0.0335,
2807
- "step": 187
2808
- },
2809
- {
2810
- "epoch": 9.842105263157894,
2811
- "eval_loss": 0.02500898391008377,
2812
- "eval_runtime": 3.4021,
2813
- "eval_samples_per_second": 8.818,
2814
- "eval_steps_per_second": 1.176,
2815
- "step": 187
2816
- },
2817
- {
2818
- "epoch": 9.894736842105264,
2819
- "grad_norm": 0.5714792013168335,
2820
- "learning_rate": 8.354689958629513e-05,
2821
- "loss": 0.0491,
2822
- "step": 188
2823
- },
2824
- {
2825
- "epoch": 9.894736842105264,
2826
- "eval_loss": 0.022844497114419937,
2827
- "eval_runtime": 3.401,
2828
- "eval_samples_per_second": 8.821,
2829
- "eval_steps_per_second": 1.176,
2830
- "step": 188
2831
- },
2832
- {
2833
- "epoch": 9.947368421052632,
2834
- "grad_norm": 0.48736098408699036,
2835
- "learning_rate": 8.106822782403376e-05,
2836
- "loss": 0.018,
2837
- "step": 189
2838
- },
2839
- {
2840
- "epoch": 9.947368421052632,
2841
- "eval_loss": 0.021435970440506935,
2842
- "eval_runtime": 3.4008,
2843
- "eval_samples_per_second": 8.821,
2844
- "eval_steps_per_second": 1.176,
2845
- "step": 189
2846
- },
2847
- {
2848
- "epoch": 10.0,
2849
- "grad_norm": 0.5927891731262207,
2850
- "learning_rate": 7.861974646342596e-05,
2851
- "loss": 0.0388,
2852
- "step": 190
2853
- },
2854
- {
2855
- "epoch": 10.0,
2856
- "eval_loss": 0.019742580130696297,
2857
- "eval_runtime": 3.4006,
2858
- "eval_samples_per_second": 8.822,
2859
- "eval_steps_per_second": 1.176,
2860
- "step": 190
2861
- },
2862
- {
2863
- "epoch": 10.052631578947368,
2864
- "grad_norm": 0.3376651108264923,
2865
- "learning_rate": 7.620189308133943e-05,
2866
- "loss": 0.0196,
2867
- "step": 191
2868
- },
2869
- {
2870
- "epoch": 10.052631578947368,
2871
- "eval_loss": 0.018559806048870087,
2872
- "eval_runtime": 3.388,
2873
- "eval_samples_per_second": 8.855,
2874
- "eval_steps_per_second": 1.181,
2875
- "step": 191
2876
- },
2877
- {
2878
- "epoch": 10.105263157894736,
2879
- "grad_norm": 0.3613579273223877,
2880
- "learning_rate": 7.381509978100626e-05,
2881
- "loss": 0.0172,
2882
- "step": 192
2883
- },
2884
- {
2885
- "epoch": 10.105263157894736,
2886
- "eval_loss": 0.017322294414043427,
2887
- "eval_runtime": 3.3891,
2888
- "eval_samples_per_second": 8.852,
2889
- "eval_steps_per_second": 1.18,
2890
- "step": 192
2891
- },
2892
- {
2893
- "epoch": 10.157894736842104,
2894
- "grad_norm": 0.2621256411075592,
2895
- "learning_rate": 7.145979311479986e-05,
2896
- "loss": 0.0159,
2897
- "step": 193
2898
- },
2899
- {
2900
- "epoch": 10.157894736842104,
2901
- "eval_loss": 0.016333211213350296,
2902
- "eval_runtime": 3.4014,
2903
- "eval_samples_per_second": 8.82,
2904
- "eval_steps_per_second": 1.176,
2905
- "step": 193
2906
- },
2907
- {
2908
- "epoch": 10.210526315789474,
2909
- "grad_norm": 0.24995078146457672,
2910
- "learning_rate": 6.913639400800489e-05,
2911
- "loss": 0.0132,
2912
- "step": 194
2913
- },
2914
- {
2915
- "epoch": 10.210526315789474,
2916
- "eval_loss": 0.015769897028803825,
2917
- "eval_runtime": 3.401,
2918
- "eval_samples_per_second": 8.821,
2919
- "eval_steps_per_second": 1.176,
2920
- "step": 194
2921
- },
2922
- {
2923
- "epoch": 10.263157894736842,
2924
- "grad_norm": 0.38419196009635925,
2925
- "learning_rate": 6.684531768359173e-05,
2926
- "loss": 0.0196,
2927
- "step": 195
2928
- },
2929
- {
2930
- "epoch": 10.263157894736842,
2931
- "eval_loss": 0.015028283931314945,
2932
- "eval_runtime": 3.4,
2933
- "eval_samples_per_second": 8.824,
2934
- "eval_steps_per_second": 1.176,
2935
- "step": 195
2936
- },
2937
- {
2938
- "epoch": 10.31578947368421,
2939
- "grad_norm": 0.23766584694385529,
2940
- "learning_rate": 6.458697358801061e-05,
2941
- "loss": 0.009,
2942
- "step": 196
2943
- },
2944
- {
2945
- "epoch": 10.31578947368421,
2946
- "eval_loss": 0.014445771463215351,
2947
- "eval_runtime": 3.3979,
2948
- "eval_samples_per_second": 8.829,
2949
- "eval_steps_per_second": 1.177,
2950
- "step": 196
2951
- },
2952
- {
2953
- "epoch": 10.368421052631579,
2954
- "grad_norm": 0.2710660398006439,
2955
- "learning_rate": 6.236176531801813e-05,
2956
- "loss": 0.0096,
2957
- "step": 197
2958
- },
2959
- {
2960
- "epoch": 10.368421052631579,
2961
- "eval_loss": 0.01395699568092823,
2962
- "eval_runtime": 3.3981,
2963
- "eval_samples_per_second": 8.828,
2964
- "eval_steps_per_second": 1.177,
2965
- "step": 197
2966
- },
2967
- {
2968
- "epoch": 10.421052631578947,
2969
- "grad_norm": 0.20278970897197723,
2970
- "learning_rate": 6.017009054854858e-05,
2971
- "loss": 0.0087,
2972
- "step": 198
2973
- },
2974
- {
2975
- "epoch": 10.421052631578947,
2976
- "eval_loss": 0.013656516559422016,
2977
- "eval_runtime": 3.4043,
2978
- "eval_samples_per_second": 8.812,
2979
- "eval_steps_per_second": 1.175,
2980
- "step": 198
2981
- },
2982
- {
2983
- "epoch": 10.473684210526315,
2984
- "grad_norm": 0.3319687247276306,
2985
- "learning_rate": 5.801234096164468e-05,
2986
- "loss": 0.016,
2987
- "step": 199
2988
- },
2989
- {
2990
- "epoch": 10.473684210526315,
2991
- "eval_loss": 0.012863567098975182,
2992
- "eval_runtime": 3.403,
2993
- "eval_samples_per_second": 8.816,
2994
- "eval_steps_per_second": 1.175,
2995
- "step": 199
2996
- },
2997
- {
2998
- "epoch": 10.526315789473685,
2999
- "grad_norm": 0.25473591685295105,
3000
- "learning_rate": 5.58889021764582e-05,
3001
- "loss": 0.0105,
3002
- "step": 200
3003
- },
3004
- {
3005
- "epoch": 10.526315789473685,
3006
- "eval_loss": 0.012198278680443764,
3007
- "eval_runtime": 3.3999,
3008
- "eval_samples_per_second": 8.824,
3009
- "eval_steps_per_second": 1.177,
3010
- "step": 200
3011
- },
3012
- {
3013
- "epoch": 10.578947368421053,
3014
- "grad_norm": 0.3705623745918274,
3015
- "learning_rate": 5.3800153680334754e-05,
3016
- "loss": 0.0134,
3017
- "step": 201
3018
- },
3019
- {
3020
- "epoch": 10.578947368421053,
3021
- "eval_loss": 0.011488989926874638,
3022
- "eval_runtime": 3.3917,
3023
- "eval_samples_per_second": 8.845,
3024
- "eval_steps_per_second": 1.179,
3025
- "step": 201
3026
- },
3027
- {
3028
- "epoch": 10.631578947368421,
3029
- "grad_norm": 0.24455586075782776,
3030
- "learning_rate": 5.17464687609942e-05,
3031
- "loss": 0.0112,
3032
- "step": 202
3033
- },
3034
- {
3035
- "epoch": 10.631578947368421,
3036
- "eval_loss": 0.010651330463588238,
3037
- "eval_runtime": 3.3998,
3038
- "eval_samples_per_second": 8.824,
3039
- "eval_steps_per_second": 1.177,
3040
- "step": 202
3041
- },
3042
- {
3043
- "epoch": 10.68421052631579,
3044
- "grad_norm": 0.2879987955093384,
3045
- "learning_rate": 4.97282144398192e-05,
3046
- "loss": 0.0108,
3047
- "step": 203
3048
- },
3049
- {
3050
- "epoch": 10.68421052631579,
3051
- "eval_loss": 0.010258635506033897,
3052
- "eval_runtime": 3.4041,
3053
- "eval_samples_per_second": 8.813,
3054
- "eval_steps_per_second": 1.175,
3055
- "step": 203
3056
- },
3057
- {
3058
- "epoch": 10.736842105263158,
3059
- "grad_norm": 0.2595934569835663,
3060
- "learning_rate": 4.7745751406263163e-05,
3061
- "loss": 0.0116,
3062
- "step": 204
3063
- },
3064
- {
3065
- "epoch": 10.736842105263158,
3066
- "eval_loss": 0.009770309552550316,
3067
- "eval_runtime": 3.4083,
3068
- "eval_samples_per_second": 8.802,
3069
- "eval_steps_per_second": 1.174,
3070
- "step": 204
3071
- },
3072
- {
3073
- "epoch": 10.789473684210526,
3074
- "grad_norm": 0.3026018738746643,
3075
- "learning_rate": 4.5799433953390616e-05,
3076
- "loss": 0.0116,
3077
- "step": 205
3078
- },
3079
- {
3080
- "epoch": 10.789473684210526,
3081
- "eval_loss": 0.00936987716704607,
3082
- "eval_runtime": 3.4054,
3083
- "eval_samples_per_second": 8.81,
3084
- "eval_steps_per_second": 1.175,
3085
- "step": 205
3086
- },
3087
- {
3088
- "epoch": 10.842105263157894,
3089
- "grad_norm": 0.4068312644958496,
3090
- "learning_rate": 4.388960991455998e-05,
3091
- "loss": 0.0109,
3092
- "step": 206
3093
- },
3094
- {
3095
- "epoch": 10.842105263157894,
3096
- "eval_loss": 0.008922109380364418,
3097
- "eval_runtime": 3.4117,
3098
- "eval_samples_per_second": 8.793,
3099
- "eval_steps_per_second": 1.172,
3100
- "step": 206
3101
- },
3102
- {
3103
- "epoch": 10.894736842105264,
3104
- "grad_norm": 0.3379729688167572,
3105
- "learning_rate": 4.2016620601260796e-05,
3106
- "loss": 0.015,
3107
- "step": 207
3108
- },
3109
- {
3110
- "epoch": 10.894736842105264,
3111
- "eval_loss": 0.008320866152644157,
3112
- "eval_runtime": 3.4039,
3113
- "eval_samples_per_second": 8.813,
3114
- "eval_steps_per_second": 1.175,
3115
- "step": 207
3116
- },
3117
- {
3118
- "epoch": 10.947368421052632,
3119
- "grad_norm": 0.2505350410938263,
3120
- "learning_rate": 4.0180800742117244e-05,
3121
- "loss": 0.008,
3122
- "step": 208
3123
- },
3124
- {
3125
- "epoch": 10.947368421052632,
3126
- "eval_loss": 0.007898358628153801,
3127
- "eval_runtime": 3.3962,
3128
- "eval_samples_per_second": 8.833,
3129
- "eval_steps_per_second": 1.178,
3130
- "step": 208
3131
- },
3132
- {
3133
- "epoch": 11.0,
3134
- "grad_norm": 0.36052215099334717,
3135
- "learning_rate": 3.838247842306716e-05,
3136
- "loss": 0.0133,
3137
- "step": 209
3138
- },
3139
- {
3140
- "epoch": 11.0,
3141
- "eval_loss": 0.007371651474386454,
3142
- "eval_runtime": 3.3973,
3143
- "eval_samples_per_second": 8.831,
3144
- "eval_steps_per_second": 1.177,
3145
- "step": 209
3146
- },
3147
- {
3148
- "epoch": 11.052631578947368,
3149
- "grad_norm": 0.12308855354785919,
3150
- "learning_rate": 3.662197502872885e-05,
3151
- "loss": 0.0051,
3152
- "step": 210
3153
- },
3154
- {
3155
- "epoch": 11.052631578947368,
3156
- "eval_loss": 0.006998243276029825,
3157
- "eval_runtime": 3.4004,
3158
- "eval_samples_per_second": 8.822,
3159
- "eval_steps_per_second": 1.176,
3160
- "step": 210
3161
- },
3162
- {
3163
- "epoch": 11.105263157894736,
3164
- "grad_norm": 0.12299831211566925,
3165
- "learning_rate": 3.489960518496521e-05,
3166
- "loss": 0.0065,
3167
- "step": 211
3168
- },
3169
- {
3170
- "epoch": 11.105263157894736,
3171
- "eval_loss": 0.006782620679587126,
3172
- "eval_runtime": 3.4059,
3173
- "eval_samples_per_second": 8.808,
3174
- "eval_steps_per_second": 1.174,
3175
- "step": 211
3176
- },
3177
- {
3178
- "epoch": 11.157894736842104,
3179
- "grad_norm": 0.12273000180721283,
3180
- "learning_rate": 3.321567670265568e-05,
3181
- "loss": 0.0059,
3182
- "step": 212
3183
- },
3184
- {
3185
- "epoch": 11.157894736842104,
3186
- "eval_loss": 0.006513877771794796,
3187
- "eval_runtime": 3.3943,
3188
- "eval_samples_per_second": 8.838,
3189
- "eval_steps_per_second": 1.178,
3190
- "step": 212
3191
- },
3192
- {
3193
- "epoch": 11.210526315789474,
3194
- "grad_norm": 0.11980213969945908,
3195
- "learning_rate": 3.157049052268662e-05,
3196
- "loss": 0.0051,
3197
- "step": 213
3198
- },
3199
- {
3200
- "epoch": 11.210526315789474,
3201
- "eval_loss": 0.006208530627191067,
3202
- "eval_runtime": 3.4058,
3203
- "eval_samples_per_second": 8.809,
3204
- "eval_steps_per_second": 1.174,
3205
- "step": 213
3206
- },
3207
- {
3208
- "epoch": 11.263157894736842,
3209
- "grad_norm": 0.14820842444896698,
3210
- "learning_rate": 2.9964340662168772e-05,
3211
- "loss": 0.005,
3212
- "step": 214
3213
- },
3214
- {
3215
- "epoch": 11.263157894736842,
3216
- "eval_loss": 0.006144699640572071,
3217
- "eval_runtime": 3.4009,
3218
- "eval_samples_per_second": 8.821,
3219
- "eval_steps_per_second": 1.176,
3220
- "step": 214
3221
- },
3222
- {
3223
- "epoch": 11.31578947368421,
3224
- "grad_norm": 0.09703250229358673,
3225
- "learning_rate": 2.8397514161892484e-05,
3226
- "loss": 0.0047,
3227
- "step": 215
3228
- },
3229
- {
3230
- "epoch": 11.31578947368421,
3231
- "eval_loss": 0.00596656883135438,
3232
- "eval_runtime": 3.4079,
3233
- "eval_samples_per_second": 8.803,
3234
- "eval_steps_per_second": 1.174,
3235
- "step": 215
3236
- },
3237
- {
3238
- "epoch": 11.368421052631579,
3239
- "grad_norm": 0.1398313045501709,
3240
- "learning_rate": 2.687029103502972e-05,
3241
- "loss": 0.0058,
3242
- "step": 216
3243
- },
3244
- {
3245
- "epoch": 11.368421052631579,
3246
- "eval_loss": 0.0058633070439100266,
3247
- "eval_runtime": 3.403,
3248
- "eval_samples_per_second": 8.816,
3249
- "eval_steps_per_second": 1.175,
3250
- "step": 216
3251
- },
3252
- {
3253
- "epoch": 11.421052631578947,
3254
- "grad_norm": 0.12219510972499847,
3255
- "learning_rate": 2.5382944217091723e-05,
3256
- "loss": 0.0059,
3257
- "step": 217
3258
- },
3259
- {
3260
- "epoch": 11.421052631578947,
3261
- "eval_loss": 0.0056641846895217896,
3262
- "eval_runtime": 3.4055,
3263
- "eval_samples_per_second": 8.809,
3264
- "eval_steps_per_second": 1.175,
3265
- "step": 217
3266
- },
3267
- {
3268
- "epoch": 11.473684210526315,
3269
- "grad_norm": 0.10808281600475311,
3270
- "learning_rate": 2.3935739517151916e-05,
3271
- "loss": 0.005,
3272
- "step": 218
3273
- },
3274
- {
3275
- "epoch": 11.473684210526315,
3276
- "eval_loss": 0.005585065111517906,
3277
- "eval_runtime": 3.3987,
3278
- "eval_samples_per_second": 8.827,
3279
- "eval_steps_per_second": 1.177,
3280
- "step": 218
3281
- },
3282
- {
3283
- "epoch": 11.526315789473685,
3284
- "grad_norm": 0.19032533466815948,
3285
- "learning_rate": 2.2528935570342164e-05,
3286
- "loss": 0.0063,
3287
- "step": 219
3288
- },
3289
- {
3290
- "epoch": 11.526315789473685,
3291
- "eval_loss": 0.005458400584757328,
3292
- "eval_runtime": 3.4008,
3293
- "eval_samples_per_second": 8.822,
3294
- "eval_steps_per_second": 1.176,
3295
- "step": 219
3296
- },
3297
- {
3298
- "epoch": 11.578947368421053,
3299
- "grad_norm": 0.09316842257976532,
3300
- "learning_rate": 2.1162783791631057e-05,
3301
- "loss": 0.004,
3302
- "step": 220
3303
- },
3304
- {
3305
- "epoch": 11.578947368421053,
3306
- "eval_loss": 0.0053214430809021,
3307
- "eval_runtime": 3.3972,
3308
- "eval_samples_per_second": 8.831,
3309
- "eval_steps_per_second": 1.177,
3310
- "step": 220
3311
- },
3312
- {
3313
- "epoch": 11.631578947368421,
3314
- "grad_norm": 0.13419128954410553,
3315
- "learning_rate": 1.9837528330892778e-05,
3316
- "loss": 0.0053,
3317
- "step": 221
3318
- },
3319
- {
3320
- "epoch": 11.631578947368421,
3321
- "eval_loss": 0.00523610832169652,
3322
- "eval_runtime": 3.3774,
3323
- "eval_samples_per_second": 8.883,
3324
- "eval_steps_per_second": 1.184,
3325
- "step": 221
3326
- },
3327
- {
3328
- "epoch": 11.68421052631579,
3329
- "grad_norm": 0.1483260989189148,
3330
- "learning_rate": 1.8553406029274188e-05,
3331
- "loss": 0.0063,
3332
- "step": 222
3333
- },
3334
- {
3335
- "epoch": 11.68421052631579,
3336
- "eval_loss": 0.0051864017732441425,
3337
- "eval_runtime": 3.3864,
3338
- "eval_samples_per_second": 8.859,
3339
- "eval_steps_per_second": 1.181,
3340
- "step": 222
3341
- },
3342
- {
3343
- "epoch": 11.736842105263158,
3344
- "grad_norm": 0.15016067028045654,
3345
- "learning_rate": 1.7310646376867885e-05,
3346
- "loss": 0.0067,
3347
- "step": 223
3348
- },
3349
- {
3350
- "epoch": 11.736842105263158,
3351
- "eval_loss": 0.0051628886722028255,
3352
- "eval_runtime": 3.399,
3353
- "eval_samples_per_second": 8.826,
3354
- "eval_steps_per_second": 1.177,
3355
- "step": 223
3356
- },
3357
- {
3358
- "epoch": 11.789473684210526,
3359
- "grad_norm": 0.0965675637125969,
3360
- "learning_rate": 1.6109471471699556e-05,
3361
- "loss": 0.0052,
3362
- "step": 224
3363
- },
3364
- {
3365
- "epoch": 11.789473684210526,
3366
- "eval_loss": 0.005002335179597139,
3367
- "eval_runtime": 3.4012,
3368
- "eval_samples_per_second": 8.82,
3369
- "eval_steps_per_second": 1.176,
3370
- "step": 224
3371
- },
3372
- {
3373
- "epoch": 11.842105263157894,
3374
- "grad_norm": 0.1401059329509735,
3375
- "learning_rate": 1.4950095980035772e-05,
3376
- "loss": 0.0055,
3377
- "step": 225
3378
- },
3379
- {
3380
- "epoch": 11.842105263157894,
3381
- "eval_loss": 0.004974076058715582,
3382
- "eval_runtime": 3.4045,
3383
- "eval_samples_per_second": 8.812,
3384
- "eval_steps_per_second": 1.175,
3385
- "step": 225
3386
- },
3387
- {
3388
- "epoch": 11.894736842105264,
3389
- "grad_norm": 0.08175503462553024,
3390
- "learning_rate": 1.3832727098020331e-05,
3391
- "loss": 0.0037,
3392
- "step": 226
3393
- },
3394
- {
3395
- "epoch": 11.894736842105264,
3396
- "eval_loss": 0.004897472448647022,
3397
- "eval_runtime": 3.4065,
3398
- "eval_samples_per_second": 8.807,
3399
- "eval_steps_per_second": 1.174,
3400
- "step": 226
3401
- },
3402
- {
3403
- "epoch": 11.947368421052632,
3404
- "grad_norm": 0.14667555689811707,
3405
- "learning_rate": 1.2757564514645492e-05,
3406
- "loss": 0.0047,
3407
- "step": 227
3408
- },
3409
- {
3410
- "epoch": 11.947368421052632,
3411
- "eval_loss": 0.004857571795582771,
3412
- "eval_runtime": 3.4021,
3413
- "eval_samples_per_second": 8.818,
3414
- "eval_steps_per_second": 1.176,
3415
- "step": 227
3416
- },
3417
- {
3418
- "epoch": 12.0,
3419
- "grad_norm": 0.07701026648283005,
3420
- "learning_rate": 1.1724800376064798e-05,
3421
- "loss": 0.0036,
3422
- "step": 228
3423
- },
3424
- {
3425
- "epoch": 12.0,
3426
- "eval_loss": 0.004770983941853046,
3427
- "eval_runtime": 3.4001,
3428
- "eval_samples_per_second": 8.823,
3429
- "eval_steps_per_second": 1.176,
3430
- "step": 228
3431
- },
3432
- {
3433
- "epoch": 12.052631578947368,
3434
- "grad_norm": 0.11114013940095901,
3435
- "learning_rate": 1.0734619251253963e-05,
3436
- "loss": 0.0057,
3437
- "step": 229
3438
- },
3439
- {
3440
- "epoch": 12.052631578947368,
3441
- "eval_loss": 0.004740286152809858,
3442
- "eval_runtime": 3.4009,
3443
- "eval_samples_per_second": 8.821,
3444
- "eval_steps_per_second": 1.176,
3445
- "step": 229
3446
- },
3447
- {
3448
- "epoch": 12.105263157894736,
3449
- "grad_norm": 0.07092595100402832,
3450
- "learning_rate": 9.78719809902598e-06,
3451
- "loss": 0.0035,
3452
- "step": 230
3453
- },
3454
- {
3455
- "epoch": 12.105263157894736,
3456
- "eval_loss": 0.004716214258223772,
3457
- "eval_runtime": 3.4053,
3458
- "eval_samples_per_second": 8.81,
3459
- "eval_steps_per_second": 1.175,
3460
- "step": 230
3461
- },
3462
- {
3463
- "epoch": 12.157894736842104,
3464
- "grad_norm": 0.12435787171125412,
3465
- "learning_rate": 8.882706236405884e-06,
3466
- "loss": 0.0054,
3467
- "step": 231
3468
- },
3469
- {
3470
- "epoch": 12.157894736842104,
3471
- "eval_loss": 0.004733518231660128,
3472
- "eval_runtime": 3.3993,
3473
- "eval_samples_per_second": 8.825,
3474
- "eval_steps_per_second": 1.177,
3475
- "step": 231
3476
- },
3477
- {
3478
- "epoch": 12.210526315789474,
3479
- "grad_norm": 0.12049361318349838,
3480
- "learning_rate": 8.02130530837189e-06,
3481
- "loss": 0.0053,
3482
- "step": 232
3483
- },
3484
- {
3485
- "epoch": 12.210526315789474,
3486
- "eval_loss": 0.004637454636394978,
3487
- "eval_runtime": 3.4013,
3488
- "eval_samples_per_second": 8.82,
3489
- "eval_steps_per_second": 1.176,
3490
- "step": 232
3491
- },
3492
- {
3493
- "epoch": 12.263157894736842,
3494
- "grad_norm": 0.06943191587924957,
3495
- "learning_rate": 7.203149258967034e-06,
3496
- "loss": 0.0039,
3497
- "step": 233
3498
- },
3499
- {
3500
- "epoch": 12.263157894736842,
3501
- "eval_loss": 0.004599397070705891,
3502
- "eval_runtime": 3.4029,
3503
- "eval_samples_per_second": 8.816,
3504
- "eval_steps_per_second": 1.175,
3505
- "step": 233
3506
- },
3507
- {
3508
- "epoch": 12.31578947368421,
3509
- "grad_norm": 0.10378482937812805,
3510
- "learning_rate": 6.428384303787282e-06,
3511
- "loss": 0.0053,
3512
- "step": 234
3513
- },
3514
- {
3515
- "epoch": 12.31578947368421,
3516
- "eval_loss": 0.0046176365576684475,
3517
- "eval_runtime": 3.4039,
3518
- "eval_samples_per_second": 8.813,
3519
- "eval_steps_per_second": 1.175,
3520
- "step": 234
3521
- },
3522
- {
3523
- "epoch": 12.368421052631579,
3524
- "grad_norm": 0.08170512318611145,
3525
- "learning_rate": 5.697148903850868e-06,
3526
- "loss": 0.0046,
3527
- "step": 235
3528
- },
3529
- {
3530
- "epoch": 12.368421052631579,
3531
- "eval_loss": 0.00459822965785861,
3532
- "eval_runtime": 3.404,
3533
- "eval_samples_per_second": 8.813,
3534
- "eval_steps_per_second": 1.175,
3535
- "step": 235
3536
- },
3537
- {
3538
- "epoch": 12.421052631578947,
3539
- "grad_norm": 0.09477739036083221,
3540
- "learning_rate": 5.009573740853312e-06,
3541
- "loss": 0.0047,
3542
- "step": 236
3543
- },
3544
- {
3545
- "epoch": 12.421052631578947,
3546
- "eval_loss": 0.004573486745357513,
3547
- "eval_runtime": 3.4032,
3548
- "eval_samples_per_second": 8.815,
3549
- "eval_steps_per_second": 1.175,
3550
- "step": 236
3551
- },
3552
- {
3553
- "epoch": 12.473684210526315,
3554
- "grad_norm": 0.0745476633310318,
3555
- "learning_rate": 4.365781693813048e-06,
3556
- "loss": 0.004,
3557
- "step": 237
3558
- },
3559
- {
3560
- "epoch": 12.473684210526315,
3561
- "eval_loss": 0.004487224388867617,
3562
- "eval_runtime": 3.4067,
3563
- "eval_samples_per_second": 8.806,
3564
- "eval_steps_per_second": 1.174,
3565
- "step": 237
3566
- },
3567
- {
3568
- "epoch": 12.526315789473685,
3569
- "grad_norm": 0.13931944966316223,
3570
- "learning_rate": 3.765887817111069e-06,
3571
- "loss": 0.0065,
3572
- "step": 238
3573
- },
3574
- {
3575
- "epoch": 12.526315789473685,
3576
- "eval_loss": 0.004524969030171633,
3577
- "eval_runtime": 3.4058,
3578
- "eval_samples_per_second": 8.808,
3579
- "eval_steps_per_second": 1.174,
3580
- "step": 238
3581
- },
3582
- {
3583
- "epoch": 12.578947368421053,
3584
- "grad_norm": 0.056376032531261444,
3585
- "learning_rate": 3.2099993199292688e-06,
3586
- "loss": 0.0026,
3587
- "step": 239
3588
- },
3589
- {
3590
- "epoch": 12.578947368421053,
3591
- "eval_loss": 0.0044847470708191395,
3592
- "eval_runtime": 3.3996,
3593
- "eval_samples_per_second": 8.825,
3594
- "eval_steps_per_second": 1.177,
3595
- "step": 239
3596
- },
3597
- {
3598
- "epoch": 12.631578947368421,
3599
- "grad_norm": 0.07375714182853699,
3600
- "learning_rate": 2.698215547090599e-06,
3601
- "loss": 0.004,
3602
- "step": 240
3603
- },
3604
- {
3605
- "epoch": 12.631578947368421,
3606
- "eval_loss": 0.004458704963326454,
3607
- "eval_runtime": 3.3998,
3608
- "eval_samples_per_second": 8.824,
3609
- "eval_steps_per_second": 1.177,
3610
- "step": 240
3611
- },
3612
- {
3613
- "epoch": 12.68421052631579,
3614
- "grad_norm": 0.06447097659111023,
3615
- "learning_rate": 2.230627961304993e-06,
3616
- "loss": 0.0032,
3617
- "step": 241
3618
- },
3619
- {
3620
- "epoch": 12.68421052631579,
3621
- "eval_loss": 0.0044786701910197735,
3622
- "eval_runtime": 3.3934,
3623
- "eval_samples_per_second": 8.841,
3624
- "eval_steps_per_second": 1.179,
3625
- "step": 241
3626
- },
3627
- {
3628
- "epoch": 12.736842105263158,
3629
- "grad_norm": 0.1086612269282341,
3630
- "learning_rate": 1.807320126823414e-06,
3631
- "loss": 0.0042,
3632
- "step": 242
3633
- },
3634
- {
3635
- "epoch": 12.736842105263158,
3636
- "eval_loss": 0.004519260488450527,
3637
- "eval_runtime": 3.3977,
3638
- "eval_samples_per_second": 8.83,
3639
- "eval_steps_per_second": 1.177,
3640
- "step": 242
3641
- },
3642
- {
3643
- "epoch": 12.789473684210526,
3644
- "grad_norm": 0.052398040890693665,
3645
- "learning_rate": 1.4283676945041346e-06,
3646
- "loss": 0.0024,
3647
- "step": 243
3648
- },
3649
- {
3650
- "epoch": 12.789473684210526,
3651
- "eval_loss": 0.004430453758686781,
3652
- "eval_runtime": 3.4008,
3653
- "eval_samples_per_second": 8.821,
3654
- "eval_steps_per_second": 1.176,
3655
- "step": 243
3656
- },
3657
- {
3658
- "epoch": 12.842105263157894,
3659
- "grad_norm": 0.10231564193964005,
3660
- "learning_rate": 1.0938383882926617e-06,
3661
- "loss": 0.003,
3662
- "step": 244
3663
- },
3664
- {
3665
- "epoch": 12.842105263157894,
3666
- "eval_loss": 0.0044572907499969006,
3667
- "eval_runtime": 3.4036,
3668
- "eval_samples_per_second": 8.814,
3669
- "eval_steps_per_second": 1.175,
3670
- "step": 244
3671
- },
3672
- {
3673
- "epoch": 12.894736842105264,
3674
- "grad_norm": 0.1136302798986435,
3675
- "learning_rate": 8.037919931187243e-07,
3676
- "loss": 0.0028,
3677
- "step": 245
3678
- },
3679
- {
3680
- "epoch": 12.894736842105264,
3681
- "eval_loss": 0.0044529978185892105,
3682
- "eval_runtime": 3.4025,
3683
- "eval_samples_per_second": 8.817,
3684
- "eval_steps_per_second": 1.176,
3685
- "step": 245
3686
- },
3687
- {
3688
- "epoch": 12.947368421052632,
3689
- "grad_norm": 0.08841534703969955,
3690
- "learning_rate": 5.582803442117091e-07,
3691
- "loss": 0.0034,
3692
- "step": 246
3693
- },
3694
- {
3695
- "epoch": 12.947368421052632,
3696
- "eval_loss": 0.004437682218849659,
3697
- "eval_runtime": 3.3982,
3698
- "eval_samples_per_second": 8.828,
3699
- "eval_steps_per_second": 1.177,
3700
- "step": 246
3701
- },
3702
- {
3703
- "epoch": 13.0,
3704
- "grad_norm": 0.09434516727924347,
3705
- "learning_rate": 3.5734731783715333e-07,
3706
- "loss": 0.0051,
3707
- "step": 247
3708
- },
3709
- {
3710
- "epoch": 13.0,
3711
- "eval_loss": 0.0044091795571148396,
3712
- "eval_runtime": 3.4027,
3713
- "eval_samples_per_second": 8.817,
3714
- "eval_steps_per_second": 1.176,
3715
- "step": 247
3716
- },
3717
- {
3718
- "epoch": 13.052631578947368,
3719
- "grad_norm": 0.11519359052181244,
3720
- "learning_rate": 2.0102882345540696e-07,
3721
- "loss": 0.0041,
3722
- "step": 248
3723
- },
3724
- {
3725
- "epoch": 13.052631578947368,
3726
- "eval_loss": 0.004471189342439175,
3727
- "eval_runtime": 3.3961,
3728
- "eval_samples_per_second": 8.834,
3729
- "eval_steps_per_second": 1.178,
3730
- "step": 248
3731
- },
3732
- {
3733
- "epoch": 13.105263157894736,
3734
- "grad_norm": 0.054617173969745636,
3735
- "learning_rate": 8.935279730407086e-08,
3736
- "loss": 0.0026,
3737
- "step": 249
3738
- },
3739
- {
3740
- "epoch": 13.105263157894736,
3741
- "eval_loss": 0.004416502080857754,
3742
- "eval_runtime": 3.4007,
3743
- "eval_samples_per_second": 8.822,
3744
- "eval_steps_per_second": 1.176,
3745
- "step": 249
3746
- },
3747
- {
3748
- "epoch": 13.157894736842104,
3749
- "grad_norm": 0.0668402761220932,
3750
- "learning_rate": 2.2339197405490953e-08,
3751
- "loss": 0.0035,
3752
- "step": 250
3753
- },
3754
- {
3755
- "epoch": 13.157894736842104,
3756
- "eval_loss": 0.004414246417582035,
3757
- "eval_runtime": 3.3991,
3758
- "eval_samples_per_second": 8.826,
3759
- "eval_steps_per_second": 1.177,
3760
- "step": 250
3761
  }
3762
  ],
3763
  "logging_steps": 1,
@@ -3772,12 +2722,12 @@
3772
  "should_evaluate": false,
3773
  "should_log": false,
3774
  "should_save": true,
3775
- "should_training_stop": true
3776
  },
3777
  "attributes": {}
3778
  }
3779
  },
3780
- "total_flos": 1.0147096033671168e+16,
3781
  "train_batch_size": 1,
3782
  "trial_name": null,
3783
  "trial_params": null
 
1
  {
2
+ "best_global_step": 180,
3
+ "best_metric": 0.037015657871961594,
4
+ "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-180",
5
+ "epoch": 9.473684210526315,
6
  "eval_steps": 1,
7
+ "global_step": 180,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2708
  "eval_samples_per_second": 8.823,
2709
  "eval_steps_per_second": 1.176,
2710
  "step": 180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2711
  }
2712
  ],
2713
  "logging_steps": 1,
 
2722
  "should_evaluate": false,
2723
  "should_log": false,
2724
  "should_save": true,
2725
+ "should_training_stop": false
2726
  },
2727
  "attributes": {}
2728
  }
2729
  },
2730
+ "total_flos": 7311440876433408.0,
2731
  "train_batch_size": 1,
2732
  "trial_name": null,
2733
  "trial_params": null