shulijia commited on
Commit
71ae86a
·
verified ·
1 Parent(s): 856f306

Training in progress, step 6000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7187f070bae9f728ae7049ac2993dc4453f90c49119d18bdbd8b64447c6d997e
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a51051b34edc25e0d64c30ca7f33b83a0761c13d9072778abfd10448c19f3f5e
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:223e221ba487127dc4e54dca78f99736852f1b0b40562c23f04e0259e427bcd7
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f3b41dca90dac11ca9ea5d9671e951dd4dabbed860e4fb7230df978f7f0f912
3
  size 4768663315
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b1d2665185a3f2f39cba5fc2105358b29b2d7d0e055734db8e9a540adc3d4b5
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf6efc206c57420111096224e0f9dccda6dffe818f96d885666ae79f4ad31671
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.6562758828401476,
6
  "eval_steps": 100,
7
- "global_step": 5500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4958,6 +4958,456 @@
4958
  "mean_token_accuracy": 0.7709882587194443,
4959
  "num_tokens": 45049856.0,
4960
  "step": 5500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4961
  }
4962
  ],
4963
  "logging_steps": 10,
@@ -4977,7 +5427,7 @@
4977
  "attributes": {}
4978
  }
4979
  },
4980
- "total_flos": 1.1905797279291802e+17,
4981
  "train_batch_size": 2,
4982
  "trial_name": null,
4983
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.8068669527896994,
6
  "eval_steps": 100,
7
+ "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4958
  "mean_token_accuracy": 0.7709882587194443,
4959
  "num_tokens": 45049856.0,
4960
  "step": 5500
4961
+ },
4962
+ {
4963
+ "epoch": 1.6592877042391385,
4964
+ "grad_norm": 0.7701192498207092,
4965
+ "learning_rate": 4.967655587776044e-06,
4966
+ "loss": 0.1216,
4967
+ "mean_token_accuracy": 0.7994618371129036,
4968
+ "num_tokens": 45131776.0,
4969
+ "step": 5510
4970
+ },
4971
+ {
4972
+ "epoch": 1.6622995256381297,
4973
+ "grad_norm": 1.0482795238494873,
4974
+ "learning_rate": 4.956502342181575e-06,
4975
+ "loss": 0.1661,
4976
+ "mean_token_accuracy": 0.7507583145052195,
4977
+ "num_tokens": 45213696.0,
4978
+ "step": 5520
4979
+ },
4980
+ {
4981
+ "epoch": 1.6653113470371208,
4982
+ "grad_norm": 1.0718356370925903,
4983
+ "learning_rate": 4.9453490965871075e-06,
4984
+ "loss": 0.1351,
4985
+ "mean_token_accuracy": 0.8001345373690129,
4986
+ "num_tokens": 45295616.0,
4987
+ "step": 5530
4988
+ },
4989
+ {
4990
+ "epoch": 1.6683231684361117,
4991
+ "grad_norm": 0.9752377271652222,
4992
+ "learning_rate": 4.934195850992639e-06,
4993
+ "loss": 0.1215,
4994
+ "mean_token_accuracy": 0.7919031277298927,
4995
+ "num_tokens": 45377536.0,
4996
+ "step": 5540
4997
+ },
4998
+ {
4999
+ "epoch": 1.6713349898351026,
5000
+ "grad_norm": 1.1375031471252441,
5001
+ "learning_rate": 4.923042605398171e-06,
5002
+ "loss": 0.1395,
5003
+ "mean_token_accuracy": 0.7713551849126816,
5004
+ "num_tokens": 45459456.0,
5005
+ "step": 5550
5006
+ },
5007
+ {
5008
+ "epoch": 1.6743468112340938,
5009
+ "grad_norm": 1.122489094734192,
5010
+ "learning_rate": 4.911889359803703e-06,
5011
+ "loss": 0.1349,
5012
+ "mean_token_accuracy": 0.7769814103841781,
5013
+ "num_tokens": 45541376.0,
5014
+ "step": 5560
5015
+ },
5016
+ {
5017
+ "epoch": 1.677358632633085,
5018
+ "grad_norm": 1.0552014112472534,
5019
+ "learning_rate": 4.900736114209236e-06,
5020
+ "loss": 0.1194,
5021
+ "mean_token_accuracy": 0.7823997054249048,
5022
+ "num_tokens": 45623296.0,
5023
+ "step": 5570
5024
+ },
5025
+ {
5026
+ "epoch": 1.6803704540320759,
5027
+ "grad_norm": 1.4772262573242188,
5028
+ "learning_rate": 4.889582868614767e-06,
5029
+ "loss": 0.1432,
5030
+ "mean_token_accuracy": 0.7968811176717281,
5031
+ "num_tokens": 45705216.0,
5032
+ "step": 5580
5033
+ },
5034
+ {
5035
+ "epoch": 1.6833822754310668,
5036
+ "grad_norm": 1.0504951477050781,
5037
+ "learning_rate": 4.878429623020299e-06,
5038
+ "loss": 0.1534,
5039
+ "mean_token_accuracy": 0.7667196691036224,
5040
+ "num_tokens": 45787136.0,
5041
+ "step": 5590
5042
+ },
5043
+ {
5044
+ "epoch": 1.6863940968300581,
5045
+ "grad_norm": 1.0767134428024292,
5046
+ "learning_rate": 4.8672763774258316e-06,
5047
+ "loss": 0.1384,
5048
+ "mean_token_accuracy": 0.7868884541094303,
5049
+ "num_tokens": 45869056.0,
5050
+ "step": 5600
5051
+ },
5052
+ {
5053
+ "epoch": 1.689405918229049,
5054
+ "grad_norm": 1.119947910308838,
5055
+ "learning_rate": 4.856123131831364e-06,
5056
+ "loss": 0.1249,
5057
+ "mean_token_accuracy": 0.7958781808614731,
5058
+ "num_tokens": 45950976.0,
5059
+ "step": 5610
5060
+ },
5061
+ {
5062
+ "epoch": 1.69241773962804,
5063
+ "grad_norm": 1.1654057502746582,
5064
+ "learning_rate": 4.844969886236895e-06,
5065
+ "loss": 0.1418,
5066
+ "mean_token_accuracy": 0.7592465754598379,
5067
+ "num_tokens": 46032896.0,
5068
+ "step": 5620
5069
+ },
5070
+ {
5071
+ "epoch": 1.6954295610270311,
5072
+ "grad_norm": 1.3118687868118286,
5073
+ "learning_rate": 4.8338166406424275e-06,
5074
+ "loss": 0.1268,
5075
+ "mean_token_accuracy": 0.7850048918277025,
5076
+ "num_tokens": 46114816.0,
5077
+ "step": 5630
5078
+ },
5079
+ {
5080
+ "epoch": 1.6984413824260223,
5081
+ "grad_norm": 0.907384991645813,
5082
+ "learning_rate": 4.82266339504796e-06,
5083
+ "loss": 0.1068,
5084
+ "mean_token_accuracy": 0.8026663415133953,
5085
+ "num_tokens": 46196736.0,
5086
+ "step": 5640
5087
+ },
5088
+ {
5089
+ "epoch": 1.7014532038250132,
5090
+ "grad_norm": 1.247758388519287,
5091
+ "learning_rate": 4.811510149453492e-06,
5092
+ "loss": 0.1181,
5093
+ "mean_token_accuracy": 0.784491191059351,
5094
+ "num_tokens": 46278656.0,
5095
+ "step": 5650
5096
+ },
5097
+ {
5098
+ "epoch": 1.7044650252240041,
5099
+ "grad_norm": 1.2206517457962036,
5100
+ "learning_rate": 4.800356903859023e-06,
5101
+ "loss": 0.1263,
5102
+ "mean_token_accuracy": 0.7789261259138585,
5103
+ "num_tokens": 46360576.0,
5104
+ "step": 5660
5105
+ },
5106
+ {
5107
+ "epoch": 1.7074768466229953,
5108
+ "grad_norm": 1.4693553447723389,
5109
+ "learning_rate": 4.789203658264555e-06,
5110
+ "loss": 0.1288,
5111
+ "mean_token_accuracy": 0.7743395268917084,
5112
+ "num_tokens": 46442496.0,
5113
+ "step": 5670
5114
+ },
5115
+ {
5116
+ "epoch": 1.7104886680219864,
5117
+ "grad_norm": 1.5424615144729614,
5118
+ "learning_rate": 4.778050412670087e-06,
5119
+ "loss": 0.1205,
5120
+ "mean_token_accuracy": 0.7858732886612415,
5121
+ "num_tokens": 46524416.0,
5122
+ "step": 5680
5123
+ },
5124
+ {
5125
+ "epoch": 1.7135004894209773,
5126
+ "grad_norm": 1.4653291702270508,
5127
+ "learning_rate": 4.766897167075619e-06,
5128
+ "loss": 0.1466,
5129
+ "mean_token_accuracy": 0.7676614470779896,
5130
+ "num_tokens": 46606336.0,
5131
+ "step": 5690
5132
+ },
5133
+ {
5134
+ "epoch": 1.7165123108199682,
5135
+ "grad_norm": 1.5882482528686523,
5136
+ "learning_rate": 4.755743921481152e-06,
5137
+ "loss": 0.1065,
5138
+ "mean_token_accuracy": 0.7988013669848442,
5139
+ "num_tokens": 46688256.0,
5140
+ "step": 5700
5141
+ },
5142
+ {
5143
+ "epoch": 1.7195241322189594,
5144
+ "grad_norm": 1.2498167753219604,
5145
+ "learning_rate": 4.744590675886683e-06,
5146
+ "loss": 0.1202,
5147
+ "mean_token_accuracy": 0.7973214272409678,
5148
+ "num_tokens": 46770176.0,
5149
+ "step": 5710
5150
+ },
5151
+ {
5152
+ "epoch": 1.7225359536179505,
5153
+ "grad_norm": 1.5788949728012085,
5154
+ "learning_rate": 4.733437430292215e-06,
5155
+ "loss": 0.102,
5156
+ "mean_token_accuracy": 0.7933341491967439,
5157
+ "num_tokens": 46852096.0,
5158
+ "step": 5720
5159
+ },
5160
+ {
5161
+ "epoch": 1.7255477750169415,
5162
+ "grad_norm": 1.5957988500595093,
5163
+ "learning_rate": 4.7222841846977475e-06,
5164
+ "loss": 0.1407,
5165
+ "mean_token_accuracy": 0.7833292562514543,
5166
+ "num_tokens": 46934016.0,
5167
+ "step": 5730
5168
+ },
5169
+ {
5170
+ "epoch": 1.7285595964159324,
5171
+ "grad_norm": 1.243113398551941,
5172
+ "learning_rate": 4.71113093910328e-06,
5173
+ "loss": 0.1184,
5174
+ "mean_token_accuracy": 0.7828277878463268,
5175
+ "num_tokens": 47015936.0,
5176
+ "step": 5740
5177
+ },
5178
+ {
5179
+ "epoch": 1.7315714178149237,
5180
+ "grad_norm": 1.1513034105300903,
5181
+ "learning_rate": 4.699977693508811e-06,
5182
+ "loss": 0.1371,
5183
+ "mean_token_accuracy": 0.7631604719907046,
5184
+ "num_tokens": 47097856.0,
5185
+ "step": 5750
5186
+ },
5187
+ {
5188
+ "epoch": 1.7345832392139147,
5189
+ "grad_norm": 1.3796018362045288,
5190
+ "learning_rate": 4.6888244479143434e-06,
5191
+ "loss": 0.1624,
5192
+ "mean_token_accuracy": 0.7756115447729826,
5193
+ "num_tokens": 47179776.0,
5194
+ "step": 5760
5195
+ },
5196
+ {
5197
+ "epoch": 1.7375950606129056,
5198
+ "grad_norm": 1.6037061214447021,
5199
+ "learning_rate": 4.677671202319876e-06,
5200
+ "loss": 0.135,
5201
+ "mean_token_accuracy": 0.7826076343655586,
5202
+ "num_tokens": 47261696.0,
5203
+ "step": 5770
5204
+ },
5205
+ {
5206
+ "epoch": 1.7406068820118967,
5207
+ "grad_norm": 1.030444860458374,
5208
+ "learning_rate": 4.666517956725408e-06,
5209
+ "loss": 0.1303,
5210
+ "mean_token_accuracy": 0.7866193737834692,
5211
+ "num_tokens": 47343616.0,
5212
+ "step": 5780
5213
+ },
5214
+ {
5215
+ "epoch": 1.7436187034108879,
5216
+ "grad_norm": 1.1653821468353271,
5217
+ "learning_rate": 4.655364711130939e-06,
5218
+ "loss": 0.1352,
5219
+ "mean_token_accuracy": 0.7857142832130194,
5220
+ "num_tokens": 47425536.0,
5221
+ "step": 5790
5222
+ },
5223
+ {
5224
+ "epoch": 1.7466305248098788,
5225
+ "grad_norm": 1.128597617149353,
5226
+ "learning_rate": 4.644211465536472e-06,
5227
+ "loss": 0.1329,
5228
+ "mean_token_accuracy": 0.7797822903841733,
5229
+ "num_tokens": 47507456.0,
5230
+ "step": 5800
5231
+ },
5232
+ {
5233
+ "epoch": 1.7496423462088697,
5234
+ "grad_norm": 1.2128268480300903,
5235
+ "learning_rate": 4.633058219942004e-06,
5236
+ "loss": 0.1336,
5237
+ "mean_token_accuracy": 0.7869496114552021,
5238
+ "num_tokens": 47589376.0,
5239
+ "step": 5810
5240
+ },
5241
+ {
5242
+ "epoch": 1.7526541676078609,
5243
+ "grad_norm": 1.5500558614730835,
5244
+ "learning_rate": 4.621904974347535e-06,
5245
+ "loss": 0.1571,
5246
+ "mean_token_accuracy": 0.7766511704772711,
5247
+ "num_tokens": 47671296.0,
5248
+ "step": 5820
5249
+ },
5250
+ {
5251
+ "epoch": 1.755665989006852,
5252
+ "grad_norm": 1.3147218227386475,
5253
+ "learning_rate": 4.6107517287530675e-06,
5254
+ "loss": 0.1131,
5255
+ "mean_token_accuracy": 0.7829745605587959,
5256
+ "num_tokens": 47753216.0,
5257
+ "step": 5830
5258
+ },
5259
+ {
5260
+ "epoch": 1.758677810405843,
5261
+ "grad_norm": 1.2580207586288452,
5262
+ "learning_rate": 4.599598483158599e-06,
5263
+ "loss": 0.1107,
5264
+ "mean_token_accuracy": 0.7984466716647148,
5265
+ "num_tokens": 47835136.0,
5266
+ "step": 5840
5267
+ },
5268
+ {
5269
+ "epoch": 1.7616896318048338,
5270
+ "grad_norm": 1.2919446229934692,
5271
+ "learning_rate": 4.588445237564131e-06,
5272
+ "loss": 0.1122,
5273
+ "mean_token_accuracy": 0.7956947181373835,
5274
+ "num_tokens": 47917056.0,
5275
+ "step": 5850
5276
+ },
5277
+ {
5278
+ "epoch": 1.764701453203825,
5279
+ "grad_norm": 1.4461500644683838,
5280
+ "learning_rate": 4.5772919919696635e-06,
5281
+ "loss": 0.1569,
5282
+ "mean_token_accuracy": 0.7789261236786842,
5283
+ "num_tokens": 47998976.0,
5284
+ "step": 5860
5285
+ },
5286
+ {
5287
+ "epoch": 1.7677132746028161,
5288
+ "grad_norm": 0.9612492918968201,
5289
+ "learning_rate": 4.566138746375196e-06,
5290
+ "loss": 0.1155,
5291
+ "mean_token_accuracy": 0.800122307986021,
5292
+ "num_tokens": 48080896.0,
5293
+ "step": 5870
5294
+ },
5295
+ {
5296
+ "epoch": 1.770725096001807,
5297
+ "grad_norm": 1.1701524257659912,
5298
+ "learning_rate": 4.554985500780727e-06,
5299
+ "loss": 0.1268,
5300
+ "mean_token_accuracy": 0.7784735832363368,
5301
+ "num_tokens": 48162816.0,
5302
+ "step": 5880
5303
+ },
5304
+ {
5305
+ "epoch": 1.773736917400798,
5306
+ "grad_norm": 0.8847672343254089,
5307
+ "learning_rate": 4.543832255186259e-06,
5308
+ "loss": 0.1395,
5309
+ "mean_token_accuracy": 0.771404106169939,
5310
+ "num_tokens": 48244736.0,
5311
+ "step": 5890
5312
+ },
5313
+ {
5314
+ "epoch": 1.7767487387997891,
5315
+ "grad_norm": 1.2543238401412964,
5316
+ "learning_rate": 4.532679009591792e-06,
5317
+ "loss": 0.1293,
5318
+ "mean_token_accuracy": 0.7882093921303749,
5319
+ "num_tokens": 48326656.0,
5320
+ "step": 5900
5321
+ },
5322
+ {
5323
+ "epoch": 1.7797605601987803,
5324
+ "grad_norm": 1.0476740598678589,
5325
+ "learning_rate": 4.521525763997324e-06,
5326
+ "loss": 0.1401,
5327
+ "mean_token_accuracy": 0.7806506846100092,
5328
+ "num_tokens": 48408576.0,
5329
+ "step": 5910
5330
+ },
5331
+ {
5332
+ "epoch": 1.7827723815977712,
5333
+ "grad_norm": 0.9780436754226685,
5334
+ "learning_rate": 4.510372518402855e-06,
5335
+ "loss": 0.128,
5336
+ "mean_token_accuracy": 0.7880748540163041,
5337
+ "num_tokens": 48490496.0,
5338
+ "step": 5920
5339
+ },
5340
+ {
5341
+ "epoch": 1.7857842029967623,
5342
+ "grad_norm": 1.645817756652832,
5343
+ "learning_rate": 4.4992192728083876e-06,
5344
+ "loss": 0.1207,
5345
+ "mean_token_accuracy": 0.7967710334807634,
5346
+ "num_tokens": 48572416.0,
5347
+ "step": 5930
5348
+ },
5349
+ {
5350
+ "epoch": 1.7887960243957535,
5351
+ "grad_norm": 1.1234519481658936,
5352
+ "learning_rate": 4.48806602721392e-06,
5353
+ "loss": 0.1117,
5354
+ "mean_token_accuracy": 0.7863380637019872,
5355
+ "num_tokens": 48654336.0,
5356
+ "step": 5940
5357
+ },
5358
+ {
5359
+ "epoch": 1.7918078457947444,
5360
+ "grad_norm": 1.2160331010818481,
5361
+ "learning_rate": 4.476912781619452e-06,
5362
+ "loss": 0.1089,
5363
+ "mean_token_accuracy": 0.7874755386263133,
5364
+ "num_tokens": 48736256.0,
5365
+ "step": 5950
5366
+ },
5367
+ {
5368
+ "epoch": 1.7948196671937353,
5369
+ "grad_norm": 1.0973700284957886,
5370
+ "learning_rate": 4.4657595360249835e-06,
5371
+ "loss": 0.1106,
5372
+ "mean_token_accuracy": 0.7923556726425887,
5373
+ "num_tokens": 48818176.0,
5374
+ "step": 5960
5375
+ },
5376
+ {
5377
+ "epoch": 1.7978314885927265,
5378
+ "grad_norm": 1.1763473749160767,
5379
+ "learning_rate": 4.454606290430516e-06,
5380
+ "loss": 0.1464,
5381
+ "mean_token_accuracy": 0.7801859095692635,
5382
+ "num_tokens": 48900096.0,
5383
+ "step": 5970
5384
+ },
5385
+ {
5386
+ "epoch": 1.8008433099917176,
5387
+ "grad_norm": 1.0403779745101929,
5388
+ "learning_rate": 4.443453044836047e-06,
5389
+ "loss": 0.1553,
5390
+ "mean_token_accuracy": 0.7728351287543773,
5391
+ "num_tokens": 48982016.0,
5392
+ "step": 5980
5393
+ },
5394
+ {
5395
+ "epoch": 1.8038551313907085,
5396
+ "grad_norm": 1.446248173713684,
5397
+ "learning_rate": 4.4322997992415794e-06,
5398
+ "loss": 0.1393,
5399
+ "mean_token_accuracy": 0.783953033387661,
5400
+ "num_tokens": 49063936.0,
5401
+ "step": 5990
5402
+ },
5403
+ {
5404
+ "epoch": 1.8068669527896994,
5405
+ "grad_norm": 1.383080005645752,
5406
+ "learning_rate": 4.421146553647112e-06,
5407
+ "loss": 0.134,
5408
+ "mean_token_accuracy": 0.7743395321071148,
5409
+ "num_tokens": 49145856.0,
5410
+ "step": 6000
5411
  }
5412
  ],
5413
  "logging_steps": 10,
 
5427
  "attributes": {}
5428
  }
5429
  },
5430
+ "total_flos": 1.2988290099157402e+17,
5431
  "train_batch_size": 2,
5432
  "trial_name": null,
5433
  "trial_params": null