craa commited on
Commit
d2f9915
·
verified ·
1 Parent(s): e03caa5

Model save

Browse files
README.md CHANGED
@@ -16,8 +16,8 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 3.4634
20
- - Accuracy: 0.3762
21
 
22
  ## Model description
23
 
@@ -142,9 +142,9 @@ The following hyperparameters were used during training:
142
  | 3.1362 | 9.7035 | 90000 | 0.3942 | 3.3025 |
143
  | 3.1261 | 9.8113 | 91000 | 0.3944 | 3.3008 |
144
  | 3.1183 | 9.9191 | 92000 | 0.3945 | 3.2993 |
145
- | 3.3291 | 10.0270 | 93000 | 3.4620 | 0.3781 |
146
- | 3.3628 | 10.1348 | 94000 | 3.4687 | 0.3759 |
147
- | 3.3893 | 10.2426 | 95000 | 3.4634 | 0.3762 |
148
 
149
 
150
  ### Framework versions
 
16
 
17
  This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 3.4643
20
+ - Accuracy: 0.3759
21
 
22
  ## Model description
23
 
 
142
  | 3.1362 | 9.7035 | 90000 | 0.3942 | 3.3025 |
143
  | 3.1261 | 9.8113 | 91000 | 0.3944 | 3.3008 |
144
  | 3.1183 | 9.9191 | 92000 | 0.3945 | 3.2993 |
145
+ | 3.3293 | 10.0270 | 93000 | 3.4597 | 0.3782 |
146
+ | 3.364 | 10.1348 | 94000 | 3.4676 | 0.3759 |
147
+ | 3.3881 | 10.2426 | 95000 | 3.4643 | 0.3759 |
148
 
149
 
150
  ### Framework versions
all_results.json CHANGED
@@ -2,15 +2,15 @@
2
  "epoch": 10.242587601078167,
3
  "eval_accuracy": 0.3942060412961377,
4
  "eval_loss": 3.302459716796875,
5
- "eval_runtime": 87.4912,
6
  "eval_samples": 18011,
7
- "eval_samples_per_second": 205.861,
8
- "eval_steps_per_second": 12.87,
9
  "perplexity": 27.179410419842576,
10
  "total_flos": 7.94262454272e+17,
11
- "train_loss": 0.07933730388440584,
12
- "train_runtime": 802.2542,
13
  "train_samples": 296775,
14
- "train_samples_per_second": 18496.319,
15
- "train_steps_per_second": 578.059
16
  }
 
2
  "epoch": 10.242587601078167,
3
  "eval_accuracy": 0.3942060412961377,
4
  "eval_loss": 3.302459716796875,
5
+ "eval_runtime": 97.9886,
6
  "eval_samples": 18011,
7
+ "eval_samples_per_second": 183.807,
8
+ "eval_steps_per_second": 11.491,
9
  "perplexity": 27.179410419842576,
10
  "total_flos": 7.94262454272e+17,
11
+ "train_loss": 0.07933711596037211,
12
+ "train_runtime": 850.9952,
13
  "train_samples": 296775,
14
+ "train_samples_per_second": 17436.937,
15
+ "train_steps_per_second": 544.95
16
  }
eval_results.json CHANGED
@@ -2,9 +2,9 @@
2
  "epoch": 10.242587601078167,
3
  "eval_accuracy": 0.3942060412961377,
4
  "eval_loss": 3.302459716796875,
5
- "eval_runtime": 87.4912,
6
  "eval_samples": 18011,
7
- "eval_samples_per_second": 205.861,
8
- "eval_steps_per_second": 12.87,
9
  "perplexity": 27.179410419842576
10
  }
 
2
  "epoch": 10.242587601078167,
3
  "eval_accuracy": 0.3942060412961377,
4
  "eval_loss": 3.302459716796875,
5
+ "eval_runtime": 97.9886,
6
  "eval_samples": 18011,
7
+ "eval_samples_per_second": 183.807,
8
+ "eval_steps_per_second": 11.491,
9
  "perplexity": 27.179410419842576
10
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 10.242587601078167,
3
  "total_flos": 7.94262454272e+17,
4
- "train_loss": 0.07933730388440584,
5
- "train_runtime": 802.2542,
6
  "train_samples": 296775,
7
- "train_samples_per_second": 18496.319,
8
- "train_steps_per_second": 578.059
9
  }
 
1
  {
2
  "epoch": 10.242587601078167,
3
  "total_flos": 7.94262454272e+17,
4
+ "train_loss": 0.07933711596037211,
5
+ "train_runtime": 850.9952,
6
  "train_samples": 296775,
7
+ "train_samples_per_second": 17436.937,
8
+ "train_steps_per_second": 544.95
9
  }
trainer_state.json CHANGED
@@ -13823,354 +13823,354 @@
13823
  },
13824
  {
13825
  "epoch": 10.005390835579515,
13826
- "grad_norm": 0.9870104789733887,
13827
  "learning_rate": 0.0004800828210934972,
13828
  "loss": 3.1549,
13829
  "step": 92800
13830
  },
13831
  {
13832
  "epoch": 10.01078167115903,
13833
- "grad_norm": 1.0278774499893188,
13834
  "learning_rate": 0.00048001811711420253,
13835
  "loss": 3.2322,
13836
  "step": 92850
13837
  },
13838
  {
13839
  "epoch": 10.016172506738544,
13840
- "grad_norm": 1.0414958000183105,
13841
  "learning_rate": 0.00047995341313490777,
13842
  "loss": 3.2464,
13843
  "step": 92900
13844
  },
13845
  {
13846
  "epoch": 10.021563342318059,
13847
- "grad_norm": 1.038140058517456,
13848
  "learning_rate": 0.000479888709155613,
13849
- "loss": 3.258,
13850
  "step": 92950
13851
  },
13852
  {
13853
  "epoch": 10.026954177897574,
13854
- "grad_norm": 1.06088387966156,
13855
  "learning_rate": 0.00047982400517631826,
13856
  "loss": 3.3291,
13857
  "step": 93000
13858
  },
13859
  {
13860
  "epoch": 10.026954177897574,
13861
- "eval_accuracy": 0.37815790111305103,
13862
- "eval_loss": 3.461848497390747,
13863
- "eval_runtime": 89.3271,
13864
- "eval_samples_per_second": 201.63,
13865
- "eval_steps_per_second": 12.605,
13866
  "step": 93000
13867
  },
13868
  {
13869
  "epoch": 10.032345013477089,
13870
- "grad_norm": 1.1362773180007935,
13871
  "learning_rate": 0.0004797593011970236,
13872
  "loss": 3.3166,
13873
  "step": 93050
13874
  },
13875
  {
13876
  "epoch": 10.037735849056604,
13877
- "grad_norm": 0.9830957651138306,
13878
  "learning_rate": 0.00047969459721772885,
13879
- "loss": 3.3072,
13880
  "step": 93100
13881
  },
13882
  {
13883
  "epoch": 10.04312668463612,
13884
- "grad_norm": 0.9463051557540894,
13885
  "learning_rate": 0.0004796298932384341,
13886
  "loss": 3.3084,
13887
  "step": 93150
13888
  },
13889
  {
13890
  "epoch": 10.048517520215633,
13891
- "grad_norm": 1.0279018878936768,
13892
  "learning_rate": 0.00047956518925913944,
13893
  "loss": 3.3284,
13894
  "step": 93200
13895
  },
13896
  {
13897
  "epoch": 10.053908355795148,
13898
- "grad_norm": 0.9524244666099548,
13899
  "learning_rate": 0.0004795004852798447,
13900
  "loss": 3.3278,
13901
  "step": 93250
13902
  },
13903
  {
13904
  "epoch": 10.059299191374663,
13905
- "grad_norm": 0.9707924127578735,
13906
  "learning_rate": 0.0004794357813005499,
13907
  "loss": 3.3455,
13908
  "step": 93300
13909
  },
13910
  {
13911
  "epoch": 10.064690026954178,
13912
- "grad_norm": 1.0046107769012451,
13913
  "learning_rate": 0.00047937107732125517,
13914
  "loss": 3.3333,
13915
  "step": 93350
13916
  },
13917
  {
13918
  "epoch": 10.070080862533693,
13919
- "grad_norm": 0.9432587623596191,
13920
  "learning_rate": 0.0004793063733419605,
13921
  "loss": 3.3459,
13922
  "step": 93400
13923
  },
13924
  {
13925
  "epoch": 10.075471698113208,
13926
- "grad_norm": 0.957824170589447,
13927
  "learning_rate": 0.00047924166936266576,
13928
- "loss": 3.3448,
13929
  "step": 93450
13930
  },
13931
  {
13932
  "epoch": 10.080862533692722,
13933
- "grad_norm": 0.982549250125885,
13934
  "learning_rate": 0.000479176965383371,
13935
  "loss": 3.3452,
13936
  "step": 93500
13937
  },
13938
  {
13939
  "epoch": 10.086253369272237,
13940
- "grad_norm": 0.9366627931594849,
13941
  "learning_rate": 0.0004791122614040763,
13942
- "loss": 3.374,
13943
  "step": 93550
13944
  },
13945
  {
13946
  "epoch": 10.091644204851752,
13947
- "grad_norm": 1.0103267431259155,
13948
  "learning_rate": 0.0004790475574247816,
13949
- "loss": 3.3542,
13950
  "step": 93600
13951
  },
13952
  {
13953
  "epoch": 10.097035040431267,
13954
- "grad_norm": 0.9215944409370422,
13955
  "learning_rate": 0.00047898285344548684,
13956
- "loss": 3.347,
13957
  "step": 93650
13958
  },
13959
  {
13960
  "epoch": 10.102425876010782,
13961
- "grad_norm": 0.8961822986602783,
13962
  "learning_rate": 0.00047891814946619213,
13963
- "loss": 3.3744,
13964
  "step": 93700
13965
  },
13966
  {
13967
  "epoch": 10.107816711590296,
13968
- "grad_norm": 0.9416046142578125,
13969
  "learning_rate": 0.0004788534454868974,
13970
  "loss": 3.358,
13971
  "step": 93750
13972
  },
13973
  {
13974
  "epoch": 10.11320754716981,
13975
- "grad_norm": 1.0150768756866455,
13976
  "learning_rate": 0.00047878874150760267,
13977
- "loss": 3.3535,
13978
  "step": 93800
13979
  },
13980
  {
13981
  "epoch": 10.118598382749326,
13982
- "grad_norm": 0.946563720703125,
13983
  "learning_rate": 0.00047872403752830797,
13984
- "loss": 3.3513,
13985
  "step": 93850
13986
  },
13987
  {
13988
  "epoch": 10.123989218328841,
13989
- "grad_norm": 0.8931189179420471,
13990
  "learning_rate": 0.0004786593335490132,
13991
- "loss": 3.3631,
13992
  "step": 93900
13993
  },
13994
  {
13995
  "epoch": 10.129380053908356,
13996
- "grad_norm": 0.8984525799751282,
13997
  "learning_rate": 0.0004785946295697185,
13998
- "loss": 3.3789,
13999
  "step": 93950
14000
  },
14001
  {
14002
  "epoch": 10.134770889487871,
14003
- "grad_norm": 0.9255710244178772,
14004
  "learning_rate": 0.0004785299255904238,
14005
- "loss": 3.3625,
14006
  "step": 94000
14007
  },
14008
  {
14009
  "epoch": 10.134770889487871,
14010
- "eval_accuracy": 0.3759518128788658,
14011
- "eval_loss": 3.4679677486419678,
14012
- "eval_runtime": 88.0975,
14013
- "eval_samples_per_second": 204.444,
14014
- "eval_steps_per_second": 12.781,
14015
  "step": 94000
14016
  },
14017
  {
14018
  "epoch": 10.140161725067385,
14019
- "grad_norm": 0.9162030220031738,
14020
  "learning_rate": 0.00047846522161112905,
14021
- "loss": 3.3834,
14022
  "step": 94050
14023
  },
14024
  {
14025
  "epoch": 10.1455525606469,
14026
- "grad_norm": 0.8386930823326111,
14027
  "learning_rate": 0.0004784005176318343,
14028
- "loss": 3.3664,
14029
  "step": 94100
14030
  },
14031
  {
14032
  "epoch": 10.150943396226415,
14033
- "grad_norm": 0.8516772985458374,
14034
  "learning_rate": 0.00047833581365253964,
14035
- "loss": 3.3489,
14036
  "step": 94150
14037
  },
14038
  {
14039
  "epoch": 10.15633423180593,
14040
- "grad_norm": 0.8449950814247131,
14041
  "learning_rate": 0.0004782711096732449,
14042
- "loss": 3.3772,
14043
  "step": 94200
14044
  },
14045
  {
14046
  "epoch": 10.161725067385445,
14047
- "grad_norm": 0.833962082862854,
14048
  "learning_rate": 0.0004782064056939501,
14049
- "loss": 3.3759,
14050
  "step": 94250
14051
  },
14052
  {
14053
  "epoch": 10.167115902964959,
14054
- "grad_norm": 0.9096741080284119,
14055
  "learning_rate": 0.00047814170171465536,
14056
- "loss": 3.3659,
14057
  "step": 94300
14058
  },
14059
  {
14060
  "epoch": 10.172506738544474,
14061
- "grad_norm": 0.8238817453384399,
14062
  "learning_rate": 0.0004780769977353607,
14063
  "loss": 3.384,
14064
  "step": 94350
14065
  },
14066
  {
14067
  "epoch": 10.177897574123989,
14068
- "grad_norm": 0.8702091574668884,
14069
  "learning_rate": 0.00047801229375606596,
14070
- "loss": 3.361,
14071
  "step": 94400
14072
  },
14073
  {
14074
  "epoch": 10.183288409703504,
14075
- "grad_norm": 0.9051607847213745,
14076
  "learning_rate": 0.0004779475897767712,
14077
- "loss": 3.3789,
14078
  "step": 94450
14079
  },
14080
  {
14081
  "epoch": 10.18867924528302,
14082
- "grad_norm": 0.8540977835655212,
14083
  "learning_rate": 0.00047788288579747655,
14084
- "loss": 3.3747,
14085
  "step": 94500
14086
  },
14087
  {
14088
  "epoch": 10.194070080862534,
14089
- "grad_norm": 0.8740184307098389,
14090
  "learning_rate": 0.0004778181818181818,
14091
- "loss": 3.3875,
14092
  "step": 94550
14093
  },
14094
  {
14095
  "epoch": 10.199460916442048,
14096
- "grad_norm": 0.8435526490211487,
14097
  "learning_rate": 0.00047775347783888703,
14098
- "loss": 3.3805,
14099
  "step": 94600
14100
  },
14101
  {
14102
  "epoch": 10.204851752021563,
14103
- "grad_norm": 0.9716430902481079,
14104
  "learning_rate": 0.0004776887738595923,
14105
- "loss": 3.3824,
14106
  "step": 94650
14107
  },
14108
  {
14109
  "epoch": 10.210242587601078,
14110
- "grad_norm": 0.8948441743850708,
14111
  "learning_rate": 0.0004776240698802976,
14112
- "loss": 3.3949,
14113
  "step": 94700
14114
  },
14115
  {
14116
  "epoch": 10.215633423180593,
14117
- "grad_norm": 0.845777153968811,
14118
  "learning_rate": 0.00047755936590100287,
14119
- "loss": 3.4005,
14120
  "step": 94750
14121
  },
14122
  {
14123
  "epoch": 10.221024258760108,
14124
- "grad_norm": 0.7928900122642517,
14125
  "learning_rate": 0.0004774946619217081,
14126
- "loss": 3.388,
14127
  "step": 94800
14128
  },
14129
  {
14130
  "epoch": 10.226415094339623,
14131
- "grad_norm": 0.8383722901344299,
14132
  "learning_rate": 0.0004774299579424134,
14133
  "loss": 3.3848,
14134
  "step": 94850
14135
  },
14136
  {
14137
  "epoch": 10.231805929919137,
14138
- "grad_norm": 0.8034083247184753,
14139
  "learning_rate": 0.0004773652539631187,
14140
- "loss": 3.387,
14141
  "step": 94900
14142
  },
14143
  {
14144
  "epoch": 10.237196765498652,
14145
- "grad_norm": 0.8286055326461792,
14146
  "learning_rate": 0.00047730054998382395,
14147
- "loss": 3.3888,
14148
  "step": 94950
14149
  },
14150
  {
14151
  "epoch": 10.242587601078167,
14152
- "grad_norm": 0.8412846326828003,
14153
  "learning_rate": 0.00047723584600452924,
14154
- "loss": 3.3897,
14155
  "step": 95000
14156
  },
14157
  {
14158
  "epoch": 10.242587601078167,
14159
- "eval_accuracy": 0.3762232277926264,
14160
- "eval_loss": 3.462782621383667,
14161
- "eval_runtime": 88.0025,
14162
- "eval_samples_per_second": 204.665,
14163
- "eval_steps_per_second": 12.795,
14164
  "step": 95000
14165
  },
14166
  {
14167
  "epoch": 10.242587601078167,
14168
  "step": 95000,
14169
  "total_flos": 7.94262454272e+17,
14170
- "train_loss": 0.07933730388440584,
14171
- "train_runtime": 802.2542,
14172
- "train_samples_per_second": 18496.319,
14173
- "train_steps_per_second": 578.059
14174
  }
14175
  ],
14176
  "logging_steps": 50,
 
13823
  },
13824
  {
13825
  "epoch": 10.005390835579515,
13826
+ "grad_norm": 0.9861836433410645,
13827
  "learning_rate": 0.0004800828210934972,
13828
  "loss": 3.1549,
13829
  "step": 92800
13830
  },
13831
  {
13832
  "epoch": 10.01078167115903,
13833
+ "grad_norm": 1.027587652206421,
13834
  "learning_rate": 0.00048001811711420253,
13835
  "loss": 3.2322,
13836
  "step": 92850
13837
  },
13838
  {
13839
  "epoch": 10.016172506738544,
13840
+ "grad_norm": 1.0409610271453857,
13841
  "learning_rate": 0.00047995341313490777,
13842
  "loss": 3.2464,
13843
  "step": 92900
13844
  },
13845
  {
13846
  "epoch": 10.021563342318059,
13847
+ "grad_norm": 1.0378363132476807,
13848
  "learning_rate": 0.000479888709155613,
13849
+ "loss": 3.2581,
13850
  "step": 92950
13851
  },
13852
  {
13853
  "epoch": 10.026954177897574,
13854
+ "grad_norm": 1.0601555109024048,
13855
  "learning_rate": 0.00047982400517631826,
13856
  "loss": 3.3291,
13857
  "step": 93000
13858
  },
13859
  {
13860
  "epoch": 10.026954177897574,
13861
+ "eval_accuracy": 0.37814823100603556,
13862
+ "eval_loss": 3.461963415145874,
13863
+ "eval_runtime": 97.7123,
13864
+ "eval_samples_per_second": 184.327,
13865
+ "eval_steps_per_second": 11.524,
13866
  "step": 93000
13867
  },
13868
  {
13869
  "epoch": 10.032345013477089,
13870
+ "grad_norm": 1.1365256309509277,
13871
  "learning_rate": 0.0004797593011970236,
13872
  "loss": 3.3166,
13873
  "step": 93050
13874
  },
13875
  {
13876
  "epoch": 10.037735849056604,
13877
+ "grad_norm": 0.98159259557724,
13878
  "learning_rate": 0.00047969459721772885,
13879
+ "loss": 3.3071,
13880
  "step": 93100
13881
  },
13882
  {
13883
  "epoch": 10.04312668463612,
13884
+ "grad_norm": 0.9453141689300537,
13885
  "learning_rate": 0.0004796298932384341,
13886
  "loss": 3.3084,
13887
  "step": 93150
13888
  },
13889
  {
13890
  "epoch": 10.048517520215633,
13891
+ "grad_norm": 1.0275684595108032,
13892
  "learning_rate": 0.00047956518925913944,
13893
  "loss": 3.3284,
13894
  "step": 93200
13895
  },
13896
  {
13897
  "epoch": 10.053908355795148,
13898
+ "grad_norm": 0.9490914940834045,
13899
  "learning_rate": 0.0004795004852798447,
13900
  "loss": 3.3278,
13901
  "step": 93250
13902
  },
13903
  {
13904
  "epoch": 10.059299191374663,
13905
+ "grad_norm": 0.9708228707313538,
13906
  "learning_rate": 0.0004794357813005499,
13907
  "loss": 3.3455,
13908
  "step": 93300
13909
  },
13910
  {
13911
  "epoch": 10.064690026954178,
13912
+ "grad_norm": 1.0181645154953003,
13913
  "learning_rate": 0.00047937107732125517,
13914
  "loss": 3.3333,
13915
  "step": 93350
13916
  },
13917
  {
13918
  "epoch": 10.070080862533693,
13919
+ "grad_norm": 0.9478178024291992,
13920
  "learning_rate": 0.0004793063733419605,
13921
  "loss": 3.3459,
13922
  "step": 93400
13923
  },
13924
  {
13925
  "epoch": 10.075471698113208,
13926
+ "grad_norm": 0.9686261415481567,
13927
  "learning_rate": 0.00047924166936266576,
13928
+ "loss": 3.3447,
13929
  "step": 93450
13930
  },
13931
  {
13932
  "epoch": 10.080862533692722,
13933
+ "grad_norm": 0.9726437926292419,
13934
  "learning_rate": 0.000479176965383371,
13935
  "loss": 3.3452,
13936
  "step": 93500
13937
  },
13938
  {
13939
  "epoch": 10.086253369272237,
13940
+ "grad_norm": 0.9335893392562866,
13941
  "learning_rate": 0.0004791122614040763,
13942
+ "loss": 3.3739,
13943
  "step": 93550
13944
  },
13945
  {
13946
  "epoch": 10.091644204851752,
13947
+ "grad_norm": 1.01718270778656,
13948
  "learning_rate": 0.0004790475574247816,
13949
+ "loss": 3.3543,
13950
  "step": 93600
13951
  },
13952
  {
13953
  "epoch": 10.097035040431267,
13954
+ "grad_norm": 0.9226274490356445,
13955
  "learning_rate": 0.00047898285344548684,
13956
+ "loss": 3.3471,
13957
  "step": 93650
13958
  },
13959
  {
13960
  "epoch": 10.102425876010782,
13961
+ "grad_norm": 0.8888355493545532,
13962
  "learning_rate": 0.00047891814946619213,
13963
+ "loss": 3.3745,
13964
  "step": 93700
13965
  },
13966
  {
13967
  "epoch": 10.107816711590296,
13968
+ "grad_norm": 0.9315427541732788,
13969
  "learning_rate": 0.0004788534454868974,
13970
  "loss": 3.358,
13971
  "step": 93750
13972
  },
13973
  {
13974
  "epoch": 10.11320754716981,
13975
+ "grad_norm": 1.0317213535308838,
13976
  "learning_rate": 0.00047878874150760267,
13977
+ "loss": 3.3533,
13978
  "step": 93800
13979
  },
13980
  {
13981
  "epoch": 10.118598382749326,
13982
+ "grad_norm": 0.9369990229606628,
13983
  "learning_rate": 0.00047872403752830797,
13984
+ "loss": 3.3518,
13985
  "step": 93850
13986
  },
13987
  {
13988
  "epoch": 10.123989218328841,
13989
+ "grad_norm": 0.9005582332611084,
13990
  "learning_rate": 0.0004786593335490132,
13991
+ "loss": 3.3633,
13992
  "step": 93900
13993
  },
13994
  {
13995
  "epoch": 10.129380053908356,
13996
+ "grad_norm": 0.8802098035812378,
13997
  "learning_rate": 0.0004785946295697185,
13998
+ "loss": 3.3791,
13999
  "step": 93950
14000
  },
14001
  {
14002
  "epoch": 10.134770889487871,
14003
+ "grad_norm": 0.9288963079452515,
14004
  "learning_rate": 0.0004785299255904238,
14005
+ "loss": 3.3628,
14006
  "step": 94000
14007
  },
14008
  {
14009
  "epoch": 10.134770889487871,
14010
+ "eval_accuracy": 0.37585141761052526,
14011
+ "eval_loss": 3.468660831451416,
14012
+ "eval_runtime": 96.5501,
14013
+ "eval_samples_per_second": 186.546,
14014
+ "eval_steps_per_second": 11.662,
14015
  "step": 94000
14016
  },
14017
  {
14018
  "epoch": 10.140161725067385,
14019
+ "grad_norm": 0.9049438834190369,
14020
  "learning_rate": 0.00047846522161112905,
14021
+ "loss": 3.3837,
14022
  "step": 94050
14023
  },
14024
  {
14025
  "epoch": 10.1455525606469,
14026
+ "grad_norm": 0.8451623916625977,
14027
  "learning_rate": 0.0004784005176318343,
14028
+ "loss": 3.3661,
14029
  "step": 94100
14030
  },
14031
  {
14032
  "epoch": 10.150943396226415,
14033
+ "grad_norm": 0.8525308966636658,
14034
  "learning_rate": 0.00047833581365253964,
14035
+ "loss": 3.3488,
14036
  "step": 94150
14037
  },
14038
  {
14039
  "epoch": 10.15633423180593,
14040
+ "grad_norm": 0.8477214574813843,
14041
  "learning_rate": 0.0004782711096732449,
14042
+ "loss": 3.3774,
14043
  "step": 94200
14044
  },
14045
  {
14046
  "epoch": 10.161725067385445,
14047
+ "grad_norm": 0.8376886248588562,
14048
  "learning_rate": 0.0004782064056939501,
14049
+ "loss": 3.3756,
14050
  "step": 94250
14051
  },
14052
  {
14053
  "epoch": 10.167115902964959,
14054
+ "grad_norm": 0.907804548740387,
14055
  "learning_rate": 0.00047814170171465536,
14056
+ "loss": 3.3656,
14057
  "step": 94300
14058
  },
14059
  {
14060
  "epoch": 10.172506738544474,
14061
+ "grad_norm": 0.8110917806625366,
14062
  "learning_rate": 0.0004780769977353607,
14063
  "loss": 3.384,
14064
  "step": 94350
14065
  },
14066
  {
14067
  "epoch": 10.177897574123989,
14068
+ "grad_norm": 0.8640705943107605,
14069
  "learning_rate": 0.00047801229375606596,
14070
+ "loss": 3.3611,
14071
  "step": 94400
14072
  },
14073
  {
14074
  "epoch": 10.183288409703504,
14075
+ "grad_norm": 0.9154790639877319,
14076
  "learning_rate": 0.0004779475897767712,
14077
+ "loss": 3.3792,
14078
  "step": 94450
14079
  },
14080
  {
14081
  "epoch": 10.18867924528302,
14082
+ "grad_norm": 0.8714333176612854,
14083
  "learning_rate": 0.00047788288579747655,
14084
+ "loss": 3.374,
14085
  "step": 94500
14086
  },
14087
  {
14088
  "epoch": 10.194070080862534,
14089
+ "grad_norm": 0.8845873475074768,
14090
  "learning_rate": 0.0004778181818181818,
14091
+ "loss": 3.3883,
14092
  "step": 94550
14093
  },
14094
  {
14095
  "epoch": 10.199460916442048,
14096
+ "grad_norm": 0.8398736715316772,
14097
  "learning_rate": 0.00047775347783888703,
14098
+ "loss": 3.3808,
14099
  "step": 94600
14100
  },
14101
  {
14102
  "epoch": 10.204851752021563,
14103
+ "grad_norm": 0.9570174813270569,
14104
  "learning_rate": 0.0004776887738595923,
14105
+ "loss": 3.3825,
14106
  "step": 94650
14107
  },
14108
  {
14109
  "epoch": 10.210242587601078,
14110
+ "grad_norm": 0.8978366255760193,
14111
  "learning_rate": 0.0004776240698802976,
14112
+ "loss": 3.3948,
14113
  "step": 94700
14114
  },
14115
  {
14116
  "epoch": 10.215633423180593,
14117
+ "grad_norm": 0.8541731238365173,
14118
  "learning_rate": 0.00047755936590100287,
14119
+ "loss": 3.4001,
14120
  "step": 94750
14121
  },
14122
  {
14123
  "epoch": 10.221024258760108,
14124
+ "grad_norm": 0.7903949618339539,
14125
  "learning_rate": 0.0004774946619217081,
14126
+ "loss": 3.3881,
14127
  "step": 94800
14128
  },
14129
  {
14130
  "epoch": 10.226415094339623,
14131
+ "grad_norm": 0.833877444267273,
14132
  "learning_rate": 0.0004774299579424134,
14133
  "loss": 3.3848,
14134
  "step": 94850
14135
  },
14136
  {
14137
  "epoch": 10.231805929919137,
14138
+ "grad_norm": 0.7892668843269348,
14139
  "learning_rate": 0.0004773652539631187,
14140
+ "loss": 3.3865,
14141
  "step": 94900
14142
  },
14143
  {
14144
  "epoch": 10.237196765498652,
14145
+ "grad_norm": 0.8454469442367554,
14146
  "learning_rate": 0.00047730054998382395,
14147
+ "loss": 3.3881,
14148
  "step": 94950
14149
  },
14150
  {
14151
  "epoch": 10.242587601078167,
14152
+ "grad_norm": 0.8297915458679199,
14153
  "learning_rate": 0.00047723584600452924,
14154
+ "loss": 3.3893,
14155
  "step": 95000
14156
  },
14157
  {
14158
  "epoch": 10.242587601078167,
14159
+ "eval_accuracy": 0.3762282258254659,
14160
+ "eval_loss": 3.463374614715576,
14161
+ "eval_runtime": 98.3735,
14162
+ "eval_samples_per_second": 183.088,
14163
+ "eval_steps_per_second": 11.446,
14164
  "step": 95000
14165
  },
14166
  {
14167
  "epoch": 10.242587601078167,
14168
  "step": 95000,
14169
  "total_flos": 7.94262454272e+17,
14170
+ "train_loss": 0.07933711596037211,
14171
+ "train_runtime": 850.9952,
14172
+ "train_samples_per_second": 17436.937,
14173
+ "train_steps_per_second": 544.95
14174
  }
14175
  ],
14176
  "logging_steps": 50,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4fda409c4b28b17c8ab2eb26acfdfdf4cbb3610fee56a2c7c5c46dcda4c5df3
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2676594d525aaca5c6f85bae5e1a449f3ab6327cccd5609a17294848755bfed
3
  size 5304