craa commited on
Commit
739973e
·
verified ·
1 Parent(s): 4f008ed

Model save

Browse files
README.md CHANGED
@@ -142,9 +142,9 @@ The following hyperparameters were used during training:
142
  | 3.1393 | 9.7035 | 90000 | 0.3945 | 3.3022 |
143
  | 3.1482 | 9.8113 | 91000 | 0.3948 | 3.3000 |
144
  | 3.1491 | 9.9191 | 92000 | 0.3950 | 3.2984 |
145
- | 3.3065 | 10.0270 | 93000 | 3.4581 | 0.3773 |
146
- | 3.3805 | 10.1348 | 94000 | 3.4696 | 0.3763 |
147
- | 3.3884 | 10.2426 | 95000 | 3.4676 | 0.3760 |
148
 
149
 
150
  ### Framework versions
 
142
  | 3.1393 | 9.7035 | 90000 | 0.3945 | 3.3022 |
143
  | 3.1482 | 9.8113 | 91000 | 0.3948 | 3.3000 |
144
  | 3.1491 | 9.9191 | 92000 | 0.3950 | 3.2984 |
145
+ | 3.3064 | 10.0270 | 93000 | 3.4581 | 0.3773 |
146
+ | 3.3807 | 10.1348 | 94000 | 3.4690 | 0.3764 |
147
+ | 3.3886 | 10.2426 | 95000 | 3.4676 | 0.3760 |
148
 
149
 
150
  ### Framework versions
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 10.0,
3
  "eval_accuracy": 0.3945371066453084,
4
  "eval_loss": 3.302194118499756,
5
- "eval_runtime": 144.899,
6
  "eval_samples": 18011,
7
- "eval_samples_per_second": 124.3,
8
- "eval_steps_per_second": 7.771,
9
  "perplexity": 27.17219257328467,
10
- "total_flos": 7.75449427968e+17,
11
- "train_loss": 3.4695698613991954,
12
- "train_runtime": 61247.9496,
13
  "train_samples": 296775,
14
- "train_samples_per_second": 48.455,
15
- "train_steps_per_second": 1.514
16
  }
 
1
  {
2
+ "epoch": 10.242587601078167,
3
  "eval_accuracy": 0.3945371066453084,
4
  "eval_loss": 3.302194118499756,
5
+ "eval_runtime": 147.1746,
6
  "eval_samples": 18011,
7
+ "eval_samples_per_second": 122.378,
8
+ "eval_steps_per_second": 7.651,
9
  "perplexity": 27.17219257328467,
10
+ "total_flos": 7.94262454272e+17,
11
+ "train_loss": 0.07955799303556743,
12
+ "train_runtime": 1634.6435,
13
  "train_samples": 296775,
14
+ "train_samples_per_second": 9077.667,
15
+ "train_steps_per_second": 283.701
16
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 10.0,
3
  "eval_accuracy": 0.3945371066453084,
4
  "eval_loss": 3.302194118499756,
5
- "eval_runtime": 144.899,
6
  "eval_samples": 18011,
7
- "eval_samples_per_second": 124.3,
8
- "eval_steps_per_second": 7.771,
9
  "perplexity": 27.17219257328467
10
  }
 
1
  {
2
+ "epoch": 10.242587601078167,
3
  "eval_accuracy": 0.3945371066453084,
4
  "eval_loss": 3.302194118499756,
5
+ "eval_runtime": 147.1746,
6
  "eval_samples": 18011,
7
+ "eval_samples_per_second": 122.378,
8
+ "eval_steps_per_second": 7.651,
9
  "perplexity": 27.17219257328467
10
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 10.0,
3
- "total_flos": 7.75449427968e+17,
4
- "train_loss": 3.4695698613991954,
5
- "train_runtime": 61247.9496,
6
  "train_samples": 296775,
7
- "train_samples_per_second": 48.455,
8
- "train_steps_per_second": 1.514
9
  }
 
1
  {
2
+ "epoch": 10.242587601078167,
3
+ "total_flos": 7.94262454272e+17,
4
+ "train_loss": 0.07955799303556743,
5
+ "train_runtime": 1634.6435,
6
  "train_samples": 296775,
7
+ "train_samples_per_second": 9077.667,
8
+ "train_steps_per_second": 283.701
9
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 3.302194118499756,
3
  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M__495/checkpoint-90000",
4
- "epoch": 10.0,
5
  "eval_steps": 1000,
6
- "global_step": 92750,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -13822,19 +13822,361 @@
13822
  "step": 92750
13823
  },
13824
  {
13825
- "epoch": 10.0,
13826
- "step": 92750,
13827
- "total_flos": 7.75449427968e+17,
13828
- "train_loss": 3.4695698613991954,
13829
- "train_runtime": 61247.9496,
13830
- "train_samples_per_second": 48.455,
13831
- "train_steps_per_second": 1.514
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13832
  }
13833
  ],
13834
  "logging_steps": 50,
13835
- "max_steps": 92750,
13836
  "num_input_tokens_seen": 0,
13837
- "num_train_epochs": 10,
13838
  "save_steps": 10000,
13839
  "stateful_callbacks": {
13840
  "TrainerControl": {
@@ -13848,7 +14190,7 @@
13848
  "attributes": {}
13849
  }
13850
  },
13851
- "total_flos": 7.75449427968e+17,
13852
  "train_batch_size": 32,
13853
  "trial_name": null,
13854
  "trial_params": null
 
1
  {
2
  "best_metric": 3.302194118499756,
3
  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M__495/checkpoint-90000",
4
+ "epoch": 10.242587601078167,
5
  "eval_steps": 1000,
6
+ "global_step": 95000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
13822
  "step": 92750
13823
  },
13824
  {
13825
+ "epoch": 10.005390835579515,
13826
+ "grad_norm": 1.025435209274292,
13827
+ "learning_rate": 0.0004800879974118408,
13828
+ "loss": 3.1767,
13829
+ "step": 92800
13830
+ },
13831
+ {
13832
+ "epoch": 10.01078167115903,
13833
+ "grad_norm": 1.0077670812606812,
13834
+ "learning_rate": 0.00048002329343254605,
13835
+ "loss": 3.2428,
13836
+ "step": 92850
13837
+ },
13838
+ {
13839
+ "epoch": 10.016172506738544,
13840
+ "grad_norm": 1.0313729047775269,
13841
+ "learning_rate": 0.0004799585894532513,
13842
+ "loss": 3.277,
13843
+ "step": 92900
13844
+ },
13845
+ {
13846
+ "epoch": 10.021563342318059,
13847
+ "grad_norm": 0.9879314303398132,
13848
+ "learning_rate": 0.0004798938854739566,
13849
+ "loss": 3.294,
13850
+ "step": 92950
13851
+ },
13852
+ {
13853
+ "epoch": 10.026954177897574,
13854
+ "grad_norm": 1.1394336223602295,
13855
+ "learning_rate": 0.0004798291814946619,
13856
+ "loss": 3.3065,
13857
+ "step": 93000
13858
+ },
13859
+ {
13860
+ "epoch": 10.026954177897574,
13861
+ "eval_accuracy": 0.3772760742755487,
13862
+ "eval_loss": 3.4581103324890137,
13863
+ "eval_runtime": 148.4364,
13864
+ "eval_samples_per_second": 121.338,
13865
+ "eval_steps_per_second": 7.586,
13866
+ "step": 93000
13867
+ },
13868
+ {
13869
+ "epoch": 10.032345013477089,
13870
+ "grad_norm": 1.0531760454177856,
13871
+ "learning_rate": 0.00047976447751536713,
13872
+ "loss": 3.3043,
13873
+ "step": 93050
13874
+ },
13875
+ {
13876
+ "epoch": 10.037735849056604,
13877
+ "grad_norm": 1.0207242965698242,
13878
+ "learning_rate": 0.0004796997735360724,
13879
+ "loss": 3.3286,
13880
+ "step": 93100
13881
+ },
13882
+ {
13883
+ "epoch": 10.04312668463612,
13884
+ "grad_norm": 0.9906688332557678,
13885
+ "learning_rate": 0.0004796350695567777,
13886
+ "loss": 3.329,
13887
+ "step": 93150
13888
+ },
13889
+ {
13890
+ "epoch": 10.048517520215633,
13891
+ "grad_norm": 0.9772278070449829,
13892
+ "learning_rate": 0.00047957036557748296,
13893
+ "loss": 3.34,
13894
+ "step": 93200
13895
+ },
13896
+ {
13897
+ "epoch": 10.053908355795148,
13898
+ "grad_norm": 1.0326145887374878,
13899
+ "learning_rate": 0.00047950566159818826,
13900
+ "loss": 3.3329,
13901
+ "step": 93250
13902
+ },
13903
+ {
13904
+ "epoch": 10.059299191374663,
13905
+ "grad_norm": 1.0279661417007446,
13906
+ "learning_rate": 0.0004794409576188935,
13907
+ "loss": 3.3593,
13908
+ "step": 93300
13909
+ },
13910
+ {
13911
+ "epoch": 10.064690026954178,
13912
+ "grad_norm": 1.0010513067245483,
13913
+ "learning_rate": 0.0004793762536395988,
13914
+ "loss": 3.3611,
13915
+ "step": 93350
13916
+ },
13917
+ {
13918
+ "epoch": 10.070080862533693,
13919
+ "grad_norm": 0.8727930188179016,
13920
+ "learning_rate": 0.0004793115496603041,
13921
+ "loss": 3.3464,
13922
+ "step": 93400
13923
+ },
13924
+ {
13925
+ "epoch": 10.075471698113208,
13926
+ "grad_norm": 0.928733766078949,
13927
+ "learning_rate": 0.00047924684568100934,
13928
+ "loss": 3.3566,
13929
+ "step": 93450
13930
+ },
13931
+ {
13932
+ "epoch": 10.080862533692722,
13933
+ "grad_norm": 0.9133866429328918,
13934
+ "learning_rate": 0.0004791821417017146,
13935
+ "loss": 3.3484,
13936
+ "step": 93500
13937
+ },
13938
+ {
13939
+ "epoch": 10.086253369272237,
13940
+ "grad_norm": 0.9412495493888855,
13941
+ "learning_rate": 0.00047911743772241993,
13942
+ "loss": 3.3635,
13943
+ "step": 93550
13944
+ },
13945
+ {
13946
+ "epoch": 10.091644204851752,
13947
+ "grad_norm": 0.9078878164291382,
13948
+ "learning_rate": 0.00047905273374312517,
13949
+ "loss": 3.3718,
13950
+ "step": 93600
13951
+ },
13952
+ {
13953
+ "epoch": 10.097035040431267,
13954
+ "grad_norm": 0.9754884243011475,
13955
+ "learning_rate": 0.0004789880297638304,
13956
+ "loss": 3.3655,
13957
+ "step": 93650
13958
+ },
13959
+ {
13960
+ "epoch": 10.102425876010782,
13961
+ "grad_norm": 0.9449187517166138,
13962
+ "learning_rate": 0.00047892332578453576,
13963
+ "loss": 3.3622,
13964
+ "step": 93700
13965
+ },
13966
+ {
13967
+ "epoch": 10.107816711590296,
13968
+ "grad_norm": 0.970542848110199,
13969
+ "learning_rate": 0.000478858621805241,
13970
+ "loss": 3.3786,
13971
+ "step": 93750
13972
+ },
13973
+ {
13974
+ "epoch": 10.11320754716981,
13975
+ "grad_norm": 0.990994393825531,
13976
+ "learning_rate": 0.00047879391782594625,
13977
+ "loss": 3.3696,
13978
+ "step": 93800
13979
+ },
13980
+ {
13981
+ "epoch": 10.118598382749326,
13982
+ "grad_norm": 0.8434808850288391,
13983
+ "learning_rate": 0.0004787292138466515,
13984
+ "loss": 3.3601,
13985
+ "step": 93850
13986
+ },
13987
+ {
13988
+ "epoch": 10.123989218328841,
13989
+ "grad_norm": 0.955342710018158,
13990
+ "learning_rate": 0.00047866450986735684,
13991
+ "loss": 3.3868,
13992
+ "step": 93900
13993
+ },
13994
+ {
13995
+ "epoch": 10.129380053908356,
13996
+ "grad_norm": 0.8750738501548767,
13997
+ "learning_rate": 0.0004785998058880621,
13998
+ "loss": 3.3788,
13999
+ "step": 93950
14000
+ },
14001
+ {
14002
+ "epoch": 10.134770889487871,
14003
+ "grad_norm": 0.9050654768943787,
14004
+ "learning_rate": 0.0004785351019087673,
14005
+ "loss": 3.3805,
14006
+ "step": 94000
14007
+ },
14008
+ {
14009
+ "epoch": 10.134770889487871,
14010
+ "eval_accuracy": 0.3762606043860346,
14011
+ "eval_loss": 3.4695773124694824,
14012
+ "eval_runtime": 148.5415,
14013
+ "eval_samples_per_second": 121.252,
14014
+ "eval_steps_per_second": 7.58,
14015
+ "step": 94000
14016
+ },
14017
+ {
14018
+ "epoch": 10.140161725067385,
14019
+ "grad_norm": 0.9092472791671753,
14020
+ "learning_rate": 0.00047847039792947257,
14021
+ "loss": 3.3753,
14022
+ "step": 94050
14023
+ },
14024
+ {
14025
+ "epoch": 10.1455525606469,
14026
+ "grad_norm": 0.913253128528595,
14027
+ "learning_rate": 0.0004784056939501779,
14028
+ "loss": 3.379,
14029
+ "step": 94100
14030
+ },
14031
+ {
14032
+ "epoch": 10.150943396226415,
14033
+ "grad_norm": 0.9133390784263611,
14034
+ "learning_rate": 0.00047834098997088316,
14035
+ "loss": 3.3794,
14036
+ "step": 94150
14037
+ },
14038
+ {
14039
+ "epoch": 10.15633423180593,
14040
+ "grad_norm": 0.8519137501716614,
14041
+ "learning_rate": 0.0004782762859915884,
14042
+ "loss": 3.3925,
14043
+ "step": 94200
14044
+ },
14045
+ {
14046
+ "epoch": 10.161725067385445,
14047
+ "grad_norm": 0.9071498513221741,
14048
+ "learning_rate": 0.0004782115820122937,
14049
+ "loss": 3.3714,
14050
+ "step": 94250
14051
+ },
14052
+ {
14053
+ "epoch": 10.167115902964959,
14054
+ "grad_norm": 0.868298351764679,
14055
+ "learning_rate": 0.000478146878032999,
14056
+ "loss": 3.3912,
14057
+ "step": 94300
14058
+ },
14059
+ {
14060
+ "epoch": 10.172506738544474,
14061
+ "grad_norm": 0.8858316540718079,
14062
+ "learning_rate": 0.00047808217405370424,
14063
+ "loss": 3.363,
14064
+ "step": 94350
14065
+ },
14066
+ {
14067
+ "epoch": 10.177897574123989,
14068
+ "grad_norm": 0.820442259311676,
14069
+ "learning_rate": 0.00047801747007440953,
14070
+ "loss": 3.3963,
14071
+ "step": 94400
14072
+ },
14073
+ {
14074
+ "epoch": 10.183288409703504,
14075
+ "grad_norm": 0.9085699319839478,
14076
+ "learning_rate": 0.00047795276609511483,
14077
+ "loss": 3.3966,
14078
+ "step": 94450
14079
+ },
14080
+ {
14081
+ "epoch": 10.18867924528302,
14082
+ "grad_norm": 0.8699147701263428,
14083
+ "learning_rate": 0.00047788806211582007,
14084
+ "loss": 3.3895,
14085
+ "step": 94500
14086
+ },
14087
+ {
14088
+ "epoch": 10.194070080862534,
14089
+ "grad_norm": 0.8835978507995605,
14090
+ "learning_rate": 0.00047782335813652537,
14091
+ "loss": 3.3917,
14092
+ "step": 94550
14093
+ },
14094
+ {
14095
+ "epoch": 10.199460916442048,
14096
+ "grad_norm": 0.8378937244415283,
14097
+ "learning_rate": 0.0004777586541572306,
14098
+ "loss": 3.3785,
14099
+ "step": 94600
14100
+ },
14101
+ {
14102
+ "epoch": 10.204851752021563,
14103
+ "grad_norm": 0.8834420442581177,
14104
+ "learning_rate": 0.0004776939501779359,
14105
+ "loss": 3.3843,
14106
+ "step": 94650
14107
+ },
14108
+ {
14109
+ "epoch": 10.210242587601078,
14110
+ "grad_norm": 0.9476656913757324,
14111
+ "learning_rate": 0.0004776292461986412,
14112
+ "loss": 3.394,
14113
+ "step": 94700
14114
+ },
14115
+ {
14116
+ "epoch": 10.215633423180593,
14117
+ "grad_norm": 0.8962563872337341,
14118
+ "learning_rate": 0.00047756454221934645,
14119
+ "loss": 3.3882,
14120
+ "step": 94750
14121
+ },
14122
+ {
14123
+ "epoch": 10.221024258760108,
14124
+ "grad_norm": 0.8866973519325256,
14125
+ "learning_rate": 0.0004774998382400517,
14126
+ "loss": 3.3908,
14127
+ "step": 94800
14128
+ },
14129
+ {
14130
+ "epoch": 10.226415094339623,
14131
+ "grad_norm": 0.8504216074943542,
14132
+ "learning_rate": 0.00047743513426075704,
14133
+ "loss": 3.3871,
14134
+ "step": 94850
14135
+ },
14136
+ {
14137
+ "epoch": 10.231805929919137,
14138
+ "grad_norm": 0.8385003805160522,
14139
+ "learning_rate": 0.0004773704302814623,
14140
+ "loss": 3.3962,
14141
+ "step": 94900
14142
+ },
14143
+ {
14144
+ "epoch": 10.237196765498652,
14145
+ "grad_norm": 0.8448400497436523,
14146
+ "learning_rate": 0.0004773057263021675,
14147
+ "loss": 3.4058,
14148
+ "step": 94950
14149
+ },
14150
+ {
14151
+ "epoch": 10.242587601078167,
14152
+ "grad_norm": 0.8619194626808167,
14153
+ "learning_rate": 0.0004772410223228728,
14154
+ "loss": 3.3884,
14155
+ "step": 95000
14156
+ },
14157
+ {
14158
+ "epoch": 10.242587601078167,
14159
+ "eval_accuracy": 0.37598745102606895,
14160
+ "eval_loss": 3.4676356315612793,
14161
+ "eval_runtime": 146.923,
14162
+ "eval_samples_per_second": 122.588,
14163
+ "eval_steps_per_second": 7.664,
14164
+ "step": 95000
14165
+ },
14166
+ {
14167
+ "epoch": 10.242587601078167,
14168
+ "step": 95000,
14169
+ "total_flos": 7.94262454272e+17,
14170
+ "train_loss": 0.07955799303556743,
14171
+ "train_runtime": 1634.6435,
14172
+ "train_samples_per_second": 9077.667,
14173
+ "train_steps_per_second": 283.701
14174
  }
14175
  ],
14176
  "logging_steps": 50,
14177
+ "max_steps": 463750,
14178
  "num_input_tokens_seen": 0,
14179
+ "num_train_epochs": 50,
14180
  "save_steps": 10000,
14181
  "stateful_callbacks": {
14182
  "TrainerControl": {
 
14190
  "attributes": {}
14191
  }
14192
  },
14193
+ "total_flos": 7.94262454272e+17,
14194
  "train_batch_size": 32,
14195
  "trial_name": null,
14196
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d24f4ca1b52c9db2ed02b327a68a7c52c020ab559dd4b89a1344f00db4d14cb
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7a4f9fe5e6c6a0f0bfabe88acf34319237a3bdae6f2c87408262f5977a18ecd
3
  size 5304