craa committed on
Commit
4e1be92
·
verified ·
1 Parent(s): d942429

Model save

Browse files
README.md CHANGED
@@ -16,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 3.4624
20
  - Accuracy: 0.3763
21
 
22
  ## Model description
@@ -143,8 +143,8 @@ The following hyperparameters were used during training:
143
  | 3.1314 | 9.8113 | 91000 | 0.3943 | 3.3021 |
144
  | 3.1061 | 9.9191 | 92000 | 0.3946 | 3.3004 |
145
  | 3.2987 | 10.0270 | 93000 | 3.4598 | 0.3772 |
146
- | 3.3802 | 10.1348 | 94000 | 3.4702 | 0.3756 |
147
- | 3.3824 | 10.2426 | 95000 | 3.4624 | 0.3763 |
148
 
149
 
150
  ### Framework versions
 
16
 
17
  This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 3.4625
20
  - Accuracy: 0.3763
21
 
22
  ## Model description
 
143
  | 3.1314 | 9.8113 | 91000 | 0.3943 | 3.3021 |
144
  | 3.1061 | 9.9191 | 92000 | 0.3946 | 3.3004 |
145
  | 3.2987 | 10.0270 | 93000 | 3.4598 | 0.3772 |
146
+ | 3.3802 | 10.1348 | 94000 | 3.4705 | 0.3756 |
147
+ | 3.3831 | 10.2426 | 95000 | 3.4625 | 0.3763 |
148
 
149
 
150
  ### Framework versions
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.39428448868113974,
4
- "eval_loss": 3.3030037879943848,
5
- "eval_runtime": 181.4646,
6
  "eval_samples": 18011,
7
- "eval_samples_per_second": 99.254,
8
- "eval_steps_per_second": 6.205,
9
- "perplexity": 27.194201977682727,
10
- "total_flos": 7.75449427968e+17,
11
- "train_loss": 3.4548003072815763,
12
- "train_runtime": 79154.1696,
13
  "train_samples": 296775,
14
- "train_samples_per_second": 37.493,
15
- "train_steps_per_second": 1.172
16
  }
 
1
  {
2
+ "epoch": 10.242587601078167,
3
+ "eval_accuracy": 0.3942879655735498,
4
+ "eval_loss": 3.303001642227173,
5
+ "eval_runtime": 145.5138,
6
  "eval_samples": 18011,
7
+ "eval_samples_per_second": 123.775,
8
+ "eval_steps_per_second": 7.738,
9
+ "perplexity": 27.19414362531837,
10
+ "total_flos": 7.94262454272e+17,
11
+ "train_loss": 0.07928502052708676,
12
+ "train_runtime": 1615.2002,
13
  "train_samples": 296775,
14
+ "train_samples_per_second": 9186.942,
15
+ "train_steps_per_second": 287.116
16
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.39428448868113974,
4
- "eval_loss": 3.3030037879943848,
5
- "eval_runtime": 181.4646,
6
  "eval_samples": 18011,
7
- "eval_samples_per_second": 99.254,
8
- "eval_steps_per_second": 6.205,
9
- "perplexity": 27.194201977682727
10
  }
 
1
  {
2
+ "epoch": 10.242587601078167,
3
+ "eval_accuracy": 0.3942879655735498,
4
+ "eval_loss": 3.303001642227173,
5
+ "eval_runtime": 145.5138,
6
  "eval_samples": 18011,
7
+ "eval_samples_per_second": 123.775,
8
+ "eval_steps_per_second": 7.738,
9
+ "perplexity": 27.19414362531837
10
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 10.0,
3
- "total_flos": 7.75449427968e+17,
4
- "train_loss": 3.4548003072815763,
5
- "train_runtime": 79154.1696,
6
  "train_samples": 296775,
7
- "train_samples_per_second": 37.493,
8
- "train_steps_per_second": 1.172
9
  }
 
1
  {
2
+ "epoch": 10.242587601078167,
3
+ "total_flos": 7.94262454272e+17,
4
+ "train_loss": 0.07928502052708676,
5
+ "train_runtime": 1615.2002,
6
  "train_samples": 296775,
7
+ "train_samples_per_second": 9186.942,
8
+ "train_steps_per_second": 287.116
9
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 3.3030037879943848,
3
  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M__634/checkpoint-90000",
4
- "epoch": 10.0,
5
  "eval_steps": 1000,
6
- "global_step": 92750,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -13822,19 +13822,361 @@
13822
  "step": 92750
13823
  },
13824
  {
13825
- "epoch": 10.0,
13826
- "step": 92750,
13827
- "total_flos": 7.75449427968e+17,
13828
- "train_loss": 3.4548003072815763,
13829
- "train_runtime": 79154.1696,
13830
- "train_samples_per_second": 37.493,
13831
- "train_steps_per_second": 1.172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13832
  }
13833
  ],
13834
  "logging_steps": 50,
13835
- "max_steps": 92750,
13836
  "num_input_tokens_seen": 0,
13837
- "num_train_epochs": 10,
13838
  "save_steps": 10000,
13839
  "stateful_callbacks": {
13840
  "TrainerControl": {
@@ -13848,7 +14190,7 @@
13848
  "attributes": {}
13849
  }
13850
  },
13851
- "total_flos": 7.75449427968e+17,
13852
  "train_batch_size": 32,
13853
  "trial_name": null,
13854
  "trial_params": null
 
1
  {
2
  "best_metric": 3.3030037879943848,
3
  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M__634/checkpoint-90000",
4
+ "epoch": 10.242587601078167,
5
  "eval_steps": 1000,
6
+ "global_step": 95000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
13822
  "step": 92750
13823
  },
13824
  {
13825
+ "epoch": 10.005390835579515,
13826
+ "grad_norm": 1.0576950311660767,
13827
+ "learning_rate": 0.000480085409252669,
13828
+ "loss": 3.1663,
13829
+ "step": 92800
13830
+ },
13831
+ {
13832
+ "epoch": 10.01078167115903,
13833
+ "grad_norm": 1.019791603088379,
13834
+ "learning_rate": 0.00048002070527337426,
13835
+ "loss": 3.2216,
13836
+ "step": 92850
13837
+ },
13838
+ {
13839
+ "epoch": 10.016172506738544,
13840
+ "grad_norm": 0.9198017120361328,
13841
+ "learning_rate": 0.00047995600129407956,
13842
+ "loss": 3.2466,
13843
+ "step": 92900
13844
+ },
13845
+ {
13846
+ "epoch": 10.021563342318059,
13847
+ "grad_norm": 1.101521372795105,
13848
+ "learning_rate": 0.0004798912973147848,
13849
+ "loss": 3.2908,
13850
+ "step": 92950
13851
+ },
13852
+ {
13853
+ "epoch": 10.026954177897574,
13854
+ "grad_norm": 1.0034935474395752,
13855
+ "learning_rate": 0.0004798265933354901,
13856
+ "loss": 3.2987,
13857
+ "step": 93000
13858
+ },
13859
+ {
13860
+ "epoch": 10.026954177897574,
13861
+ "eval_accuracy": 0.37723413426085234,
13862
+ "eval_loss": 3.459772825241089,
13863
+ "eval_runtime": 146.5373,
13864
+ "eval_samples_per_second": 122.911,
13865
+ "eval_steps_per_second": 7.684,
13866
+ "step": 93000
13867
+ },
13868
+ {
13869
+ "epoch": 10.032345013477089,
13870
+ "grad_norm": 0.9505324363708496,
13871
+ "learning_rate": 0.0004797618893561954,
13872
+ "loss": 3.2895,
13873
+ "step": 93050
13874
+ },
13875
+ {
13876
+ "epoch": 10.037735849056604,
13877
+ "grad_norm": 1.004460334777832,
13878
+ "learning_rate": 0.00047969718537690064,
13879
+ "loss": 3.3141,
13880
+ "step": 93100
13881
+ },
13882
+ {
13883
+ "epoch": 10.04312668463612,
13884
+ "grad_norm": 0.9728407263755798,
13885
+ "learning_rate": 0.0004796324813976059,
13886
+ "loss": 3.3228,
13887
+ "step": 93150
13888
+ },
13889
+ {
13890
+ "epoch": 10.048517520215633,
13891
+ "grad_norm": 1.1120251417160034,
13892
+ "learning_rate": 0.00047956777741831123,
13893
+ "loss": 3.3313,
13894
+ "step": 93200
13895
+ },
13896
+ {
13897
+ "epoch": 10.053908355795148,
13898
+ "grad_norm": 0.960462212562561,
13899
+ "learning_rate": 0.00047950307343901647,
13900
+ "loss": 3.3163,
13901
+ "step": 93250
13902
+ },
13903
+ {
13904
+ "epoch": 10.059299191374663,
13905
+ "grad_norm": 0.9895879030227661,
13906
+ "learning_rate": 0.0004794383694597217,
13907
+ "loss": 3.3216,
13908
+ "step": 93300
13909
+ },
13910
+ {
13911
+ "epoch": 10.064690026954178,
13912
+ "grad_norm": 0.9253028035163879,
13913
+ "learning_rate": 0.00047937366548042696,
13914
+ "loss": 3.3511,
13915
+ "step": 93350
13916
+ },
13917
+ {
13918
+ "epoch": 10.070080862533693,
13919
+ "grad_norm": 1.0076348781585693,
13920
+ "learning_rate": 0.0004793089615011323,
13921
+ "loss": 3.3373,
13922
+ "step": 93400
13923
+ },
13924
+ {
13925
+ "epoch": 10.075471698113208,
13926
+ "grad_norm": 1.0603855848312378,
13927
+ "learning_rate": 0.00047924425752183755,
13928
+ "loss": 3.3447,
13929
+ "step": 93450
13930
+ },
13931
+ {
13932
+ "epoch": 10.080862533692722,
13933
+ "grad_norm": 0.9239181280136108,
13934
+ "learning_rate": 0.0004791795535425428,
13935
+ "loss": 3.334,
13936
+ "step": 93500
13937
+ },
13938
+ {
13939
+ "epoch": 10.086253369272237,
13940
+ "grad_norm": 0.954738974571228,
13941
+ "learning_rate": 0.00047911484956324814,
13942
+ "loss": 3.3477,
13943
+ "step": 93550
13944
+ },
13945
+ {
13946
+ "epoch": 10.091644204851752,
13947
+ "grad_norm": 0.9450059533119202,
13948
+ "learning_rate": 0.0004790501455839534,
13949
+ "loss": 3.3501,
13950
+ "step": 93600
13951
+ },
13952
+ {
13953
+ "epoch": 10.097035040431267,
13954
+ "grad_norm": 0.958166241645813,
13955
+ "learning_rate": 0.0004789854416046586,
13956
+ "loss": 3.3689,
13957
+ "step": 93650
13958
+ },
13959
+ {
13960
+ "epoch": 10.102425876010782,
13961
+ "grad_norm": 0.9498693943023682,
13962
+ "learning_rate": 0.00047892073762536387,
13963
+ "loss": 3.3668,
13964
+ "step": 93700
13965
+ },
13966
+ {
13967
+ "epoch": 10.107816711590296,
13968
+ "grad_norm": 0.9249439835548401,
13969
+ "learning_rate": 0.0004788560336460692,
13970
+ "loss": 3.3628,
13971
+ "step": 93750
13972
+ },
13973
+ {
13974
+ "epoch": 10.11320754716981,
13975
+ "grad_norm": 0.9309000372886658,
13976
+ "learning_rate": 0.00047879132966677446,
13977
+ "loss": 3.3635,
13978
+ "step": 93800
13979
+ },
13980
+ {
13981
+ "epoch": 10.118598382749326,
13982
+ "grad_norm": 0.9265908002853394,
13983
+ "learning_rate": 0.0004787266256874797,
13984
+ "loss": 3.3733,
13985
+ "step": 93850
13986
+ },
13987
+ {
13988
+ "epoch": 10.123989218328841,
13989
+ "grad_norm": 0.9763278961181641,
13990
+ "learning_rate": 0.000478661921708185,
13991
+ "loss": 3.348,
13992
+ "step": 93900
13993
+ },
13994
+ {
13995
+ "epoch": 10.129380053908356,
13996
+ "grad_norm": 0.8944584727287292,
13997
+ "learning_rate": 0.0004785972177288903,
13998
+ "loss": 3.3714,
13999
+ "step": 93950
14000
+ },
14001
+ {
14002
+ "epoch": 10.134770889487871,
14003
+ "grad_norm": 1.097298264503479,
14004
+ "learning_rate": 0.00047853251374959554,
14005
+ "loss": 3.3802,
14006
+ "step": 94000
14007
+ },
14008
+ {
14009
+ "epoch": 10.134770889487871,
14010
+ "eval_accuracy": 0.3755615317058362,
14011
+ "eval_loss": 3.470184803009033,
14012
+ "eval_runtime": 148.0392,
14013
+ "eval_samples_per_second": 121.664,
14014
+ "eval_steps_per_second": 7.606,
14015
+ "step": 94000
14016
+ },
14017
+ {
14018
+ "epoch": 10.140161725067385,
14019
+ "grad_norm": 0.893616259098053,
14020
+ "learning_rate": 0.00047846780977030083,
14021
+ "loss": 3.3583,
14022
+ "step": 94050
14023
+ },
14024
+ {
14025
+ "epoch": 10.1455525606469,
14026
+ "grad_norm": 0.8614599108695984,
14027
+ "learning_rate": 0.00047840310579100613,
14028
+ "loss": 3.3667,
14029
+ "step": 94100
14030
+ },
14031
+ {
14032
+ "epoch": 10.150943396226415,
14033
+ "grad_norm": 0.9499636292457581,
14034
+ "learning_rate": 0.00047833840181171137,
14035
+ "loss": 3.3603,
14036
+ "step": 94150
14037
+ },
14038
+ {
14039
+ "epoch": 10.15633423180593,
14040
+ "grad_norm": 0.9031397104263306,
14041
+ "learning_rate": 0.00047827369783241667,
14042
+ "loss": 3.3549,
14043
+ "step": 94200
14044
+ },
14045
+ {
14046
+ "epoch": 10.161725067385445,
14047
+ "grad_norm": 0.8364366888999939,
14048
+ "learning_rate": 0.0004782089938531219,
14049
+ "loss": 3.3738,
14050
+ "step": 94250
14051
+ },
14052
+ {
14053
+ "epoch": 10.167115902964959,
14054
+ "grad_norm": 0.8245320320129395,
14055
+ "learning_rate": 0.0004781442898738272,
14056
+ "loss": 3.3764,
14057
+ "step": 94300
14058
+ },
14059
+ {
14060
+ "epoch": 10.172506738544474,
14061
+ "grad_norm": 0.8197740316390991,
14062
+ "learning_rate": 0.0004780795858945325,
14063
+ "loss": 3.3841,
14064
+ "step": 94350
14065
+ },
14066
+ {
14067
+ "epoch": 10.177897574123989,
14068
+ "grad_norm": 0.8775398135185242,
14069
+ "learning_rate": 0.00047801488191523775,
14070
+ "loss": 3.3669,
14071
+ "step": 94400
14072
+ },
14073
+ {
14074
+ "epoch": 10.183288409703504,
14075
+ "grad_norm": 0.9364191889762878,
14076
+ "learning_rate": 0.000477950177935943,
14077
+ "loss": 3.3592,
14078
+ "step": 94450
14079
+ },
14080
+ {
14081
+ "epoch": 10.18867924528302,
14082
+ "grad_norm": 0.8914583921432495,
14083
+ "learning_rate": 0.00047788547395664834,
14084
+ "loss": 3.3755,
14085
+ "step": 94500
14086
+ },
14087
+ {
14088
+ "epoch": 10.194070080862534,
14089
+ "grad_norm": 0.7897307872772217,
14090
+ "learning_rate": 0.0004778207699773536,
14091
+ "loss": 3.3799,
14092
+ "step": 94550
14093
+ },
14094
+ {
14095
+ "epoch": 10.199460916442048,
14096
+ "grad_norm": 0.83349609375,
14097
+ "learning_rate": 0.0004777560659980588,
14098
+ "loss": 3.3733,
14099
+ "step": 94600
14100
+ },
14101
+ {
14102
+ "epoch": 10.204851752021563,
14103
+ "grad_norm": 0.9644209146499634,
14104
+ "learning_rate": 0.00047769136201876406,
14105
+ "loss": 3.3894,
14106
+ "step": 94650
14107
+ },
14108
+ {
14109
+ "epoch": 10.210242587601078,
14110
+ "grad_norm": 0.7957009077072144,
14111
+ "learning_rate": 0.0004776266580394694,
14112
+ "loss": 3.3855,
14113
+ "step": 94700
14114
+ },
14115
+ {
14116
+ "epoch": 10.215633423180593,
14117
+ "grad_norm": 0.8263417482376099,
14118
+ "learning_rate": 0.00047756195406017466,
14119
+ "loss": 3.3857,
14120
+ "step": 94750
14121
+ },
14122
+ {
14123
+ "epoch": 10.221024258760108,
14124
+ "grad_norm": 0.8626648187637329,
14125
+ "learning_rate": 0.0004774972500808799,
14126
+ "loss": 3.3935,
14127
+ "step": 94800
14128
+ },
14129
+ {
14130
+ "epoch": 10.226415094339623,
14131
+ "grad_norm": 0.8164050579071045,
14132
+ "learning_rate": 0.00047743254610158525,
14133
+ "loss": 3.3991,
14134
+ "step": 94850
14135
+ },
14136
+ {
14137
+ "epoch": 10.231805929919137,
14138
+ "grad_norm": 0.9200409054756165,
14139
+ "learning_rate": 0.0004773678421222905,
14140
+ "loss": 3.3668,
14141
+ "step": 94900
14142
+ },
14143
+ {
14144
+ "epoch": 10.237196765498652,
14145
+ "grad_norm": 0.7908764481544495,
14146
+ "learning_rate": 0.00047730313814299573,
14147
+ "loss": 3.3905,
14148
+ "step": 94950
14149
+ },
14150
+ {
14151
+ "epoch": 10.242587601078167,
14152
+ "grad_norm": 0.7977242469787598,
14153
+ "learning_rate": 0.000477238434163701,
14154
+ "loss": 3.3824,
14155
+ "step": 95000
14156
+ },
14157
+ {
14158
+ "epoch": 10.242587601078167,
14159
+ "eval_accuracy": 0.376265059154435,
14160
+ "eval_loss": 3.4624273777008057,
14161
+ "eval_runtime": 146.0721,
14162
+ "eval_samples_per_second": 123.302,
14163
+ "eval_steps_per_second": 7.709,
14164
+ "step": 95000
14165
+ },
14166
+ {
14167
+ "epoch": 10.242587601078167,
14168
+ "step": 95000,
14169
+ "total_flos": 7.94262454272e+17,
14170
+ "train_loss": 0.07928502052708676,
14171
+ "train_runtime": 1615.2002,
14172
+ "train_samples_per_second": 9186.942,
14173
+ "train_steps_per_second": 287.116
14174
  }
14175
  ],
14176
  "logging_steps": 50,
14177
+ "max_steps": 463750,
14178
  "num_input_tokens_seen": 0,
14179
+ "num_train_epochs": 50,
14180
  "save_steps": 10000,
14181
  "stateful_callbacks": {
14182
  "TrainerControl": {
 
14190
  "attributes": {}
14191
  }
14192
  },
14193
+ "total_flos": 7.94262454272e+17,
14194
  "train_batch_size": 32,
14195
  "trial_name": null,
14196
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6dbba059ae4e965260f4be9ebe73f66dc0ec9b205428e29ea65d2a6a6ded9fe2
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed851c4d340fa66276b0dfb5ef19368092eb830abe5fc6c31e1a867a4b565c42
3
  size 5304