minpeter commited on
Commit
d2e3dad
·
verified ·
1 Parent(s): 8856462

Training in progress, step 800, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa231a4fb18485169d08c9d1e7878f2c6c2747cf33272ebb7b91a615a73da69f
3
  size 373077376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd0e106749ec154eecd3ebb9fe7474cf3444291df427cce5d1d61cb2679e8088
3
  size 373077376
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff95fd30c41364a06356f6550493cfc79f8b5f14e8279f05b156b0d50603cfb7
3
  size 422377675
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6982c3f836b1c5917d64c3c2c07418fb1042a731f48fa53830cf50384b985a7
3
  size 422377675
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36f1c8cafda7ec05bcf717e4cbc9d475e180378b36391598d72523001d0947ee
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2933fa623da5d83a2ffe4eddaad982ac15c82f5c890445e228adf894e89f9290
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b636e22decc0690abb4217d3b016f329ae73b4d12bae4602c74bba0c4d4ffdc1
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4958966c61d8eed22eb0bdc6e0a1efc61ae912a801cb91fc7888b1951205081b
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.726895119418484,
6
  "eval_steps": 100,
7
- "global_step": 700,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4964,6 +4964,714 @@
4964
  "eval_samples_per_second": 9.72,
4965
  "eval_steps_per_second": 1.215,
4966
  "step": 700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4967
  }
4968
  ],
4969
  "logging_steps": 1,
@@ -4983,7 +5691,7 @@
4983
  "attributes": {}
4984
  }
4985
  },
4986
- "total_flos": 8.91532735414272e+16,
4987
  "train_batch_size": 16,
4988
  "trial_name": null,
4989
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8307372793354102,
6
  "eval_steps": 100,
7
+ "global_step": 800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4964
  "eval_samples_per_second": 9.72,
4965
  "eval_steps_per_second": 1.215,
4966
  "step": 700
4967
+ },
4968
+ {
4969
+ "epoch": 0.7279335410176532,
4970
+ "grad_norm": 1.7109375,
4971
+ "learning_rate": 0.00019075678737450686,
4972
+ "loss": 6.1547,
4973
+ "step": 701
4974
+ },
4975
+ {
4976
+ "epoch": 0.7289719626168224,
4977
+ "grad_norm": 1.7734375,
4978
+ "learning_rate": 0.00018940815215922607,
4979
+ "loss": 6.0599,
4980
+ "step": 702
4981
+ },
4982
+ {
4983
+ "epoch": 0.7300103842159917,
4984
+ "grad_norm": 1.6171875,
4985
+ "learning_rate": 0.00018806318636018665,
4986
+ "loss": 6.2195,
4987
+ "step": 703
4988
+ },
4989
+ {
4990
+ "epoch": 0.731048805815161,
4991
+ "grad_norm": 1.46875,
4992
+ "learning_rate": 0.00018672190586717908,
4993
+ "loss": 6.4289,
4994
+ "step": 704
4995
+ },
4996
+ {
4997
+ "epoch": 0.7320872274143302,
4998
+ "grad_norm": 1.203125,
4999
+ "learning_rate": 0.00018538432652645437,
5000
+ "loss": 6.2451,
5001
+ "step": 705
5002
+ },
5003
+ {
5004
+ "epoch": 0.7331256490134995,
5005
+ "grad_norm": 1.1640625,
5006
+ "learning_rate": 0.00018405046414053728,
5007
+ "loss": 6.2281,
5008
+ "step": 706
5009
+ },
5010
+ {
5011
+ "epoch": 0.7341640706126688,
5012
+ "grad_norm": 1.484375,
5013
+ "learning_rate": 0.00018272033446803949,
5014
+ "loss": 6.2168,
5015
+ "step": 707
5016
+ },
5017
+ {
5018
+ "epoch": 0.735202492211838,
5019
+ "grad_norm": 1.7890625,
5020
+ "learning_rate": 0.00018139395322347334,
5021
+ "loss": 6.1276,
5022
+ "step": 708
5023
+ },
5024
+ {
5025
+ "epoch": 0.7362409138110073,
5026
+ "grad_norm": 1.4453125,
5027
+ "learning_rate": 0.00018007133607706615,
5028
+ "loss": 6.0438,
5029
+ "step": 709
5030
+ },
5031
+ {
5032
+ "epoch": 0.7372793354101765,
5033
+ "grad_norm": 1.875,
5034
+ "learning_rate": 0.00017875249865457527,
5035
+ "loss": 6.0624,
5036
+ "step": 710
5037
+ },
5038
+ {
5039
+ "epoch": 0.7383177570093458,
5040
+ "grad_norm": 1.671875,
5041
+ "learning_rate": 0.00017743745653710336,
5042
+ "loss": 6.4648,
5043
+ "step": 711
5044
+ },
5045
+ {
5046
+ "epoch": 0.7393561786085151,
5047
+ "grad_norm": 1.4375,
5048
+ "learning_rate": 0.00017612622526091403,
5049
+ "loss": 6.2431,
5050
+ "step": 712
5051
+ },
5052
+ {
5053
+ "epoch": 0.7403946002076843,
5054
+ "grad_norm": 1.375,
5055
+ "learning_rate": 0.00017481882031724927,
5056
+ "loss": 6.3375,
5057
+ "step": 713
5058
+ },
5059
+ {
5060
+ "epoch": 0.7414330218068536,
5061
+ "grad_norm": 1.703125,
5062
+ "learning_rate": 0.0001735152571521451,
5063
+ "loss": 6.3156,
5064
+ "step": 714
5065
+ },
5066
+ {
5067
+ "epoch": 0.7424714434060229,
5068
+ "grad_norm": 1.1484375,
5069
+ "learning_rate": 0.00017221555116625,
5070
+ "loss": 6.2417,
5071
+ "step": 715
5072
+ },
5073
+ {
5074
+ "epoch": 0.7435098650051921,
5075
+ "grad_norm": 1.09375,
5076
+ "learning_rate": 0.0001709197177146425,
5077
+ "loss": 6.3143,
5078
+ "step": 716
5079
+ },
5080
+ {
5081
+ "epoch": 0.7445482866043613,
5082
+ "grad_norm": 1.34375,
5083
+ "learning_rate": 0.0001696277721066502,
5084
+ "loss": 6.1396,
5085
+ "step": 717
5086
+ },
5087
+ {
5088
+ "epoch": 0.7455867082035307,
5089
+ "grad_norm": 1.2265625,
5090
+ "learning_rate": 0.00016833972960566868,
5091
+ "loss": 6.3164,
5092
+ "step": 718
5093
+ },
5094
+ {
5095
+ "epoch": 0.7466251298026999,
5096
+ "grad_norm": 1.6484375,
5097
+ "learning_rate": 0.00016705560542898051,
5098
+ "loss": 6.2559,
5099
+ "step": 719
5100
+ },
5101
+ {
5102
+ "epoch": 0.7476635514018691,
5103
+ "grad_norm": 1.5234375,
5104
+ "learning_rate": 0.00016577541474757713,
5105
+ "loss": 6.4104,
5106
+ "step": 720
5107
+ },
5108
+ {
5109
+ "epoch": 0.7487019730010385,
5110
+ "grad_norm": 1.421875,
5111
+ "learning_rate": 0.00016449917268597798,
5112
+ "loss": 6.0631,
5113
+ "step": 721
5114
+ },
5115
+ {
5116
+ "epoch": 0.7497403946002077,
5117
+ "grad_norm": 1.4140625,
5118
+ "learning_rate": 0.00016322689432205252,
5119
+ "loss": 6.3299,
5120
+ "step": 722
5121
+ },
5122
+ {
5123
+ "epoch": 0.7507788161993769,
5124
+ "grad_norm": 1.5546875,
5125
+ "learning_rate": 0.00016195859468684198,
5126
+ "loss": 6.2053,
5127
+ "step": 723
5128
+ },
5129
+ {
5130
+ "epoch": 0.7518172377985463,
5131
+ "grad_norm": 1.5546875,
5132
+ "learning_rate": 0.00016069428876438202,
5133
+ "loss": 6.3051,
5134
+ "step": 724
5135
+ },
5136
+ {
5137
+ "epoch": 0.7528556593977155,
5138
+ "grad_norm": 1.359375,
5139
+ "learning_rate": 0.00015943399149152533,
5140
+ "loss": 6.1548,
5141
+ "step": 725
5142
+ },
5143
+ {
5144
+ "epoch": 0.7538940809968847,
5145
+ "grad_norm": 1.375,
5146
+ "learning_rate": 0.00015817771775776507,
5147
+ "loss": 6.2009,
5148
+ "step": 726
5149
+ },
5150
+ {
5151
+ "epoch": 0.754932502596054,
5152
+ "grad_norm": 1.3046875,
5153
+ "learning_rate": 0.00015692548240506,
5154
+ "loss": 6.2028,
5155
+ "step": 727
5156
+ },
5157
+ {
5158
+ "epoch": 0.7559709241952233,
5159
+ "grad_norm": 1.25,
5160
+ "learning_rate": 0.00015567730022765752,
5161
+ "loss": 5.9373,
5162
+ "step": 728
5163
+ },
5164
+ {
5165
+ "epoch": 0.7570093457943925,
5166
+ "grad_norm": 1.203125,
5167
+ "learning_rate": 0.0001544331859719202,
5168
+ "loss": 6.1761,
5169
+ "step": 729
5170
+ },
5171
+ {
5172
+ "epoch": 0.7580477673935618,
5173
+ "grad_norm": 1.484375,
5174
+ "learning_rate": 0.000153193154336151,
5175
+ "loss": 6.2549,
5176
+ "step": 730
5177
+ },
5178
+ {
5179
+ "epoch": 0.7590861889927311,
5180
+ "grad_norm": 1.609375,
5181
+ "learning_rate": 0.00015195721997041933,
5182
+ "loss": 6.1982,
5183
+ "step": 731
5184
+ },
5185
+ {
5186
+ "epoch": 0.7601246105919003,
5187
+ "grad_norm": 1.59375,
5188
+ "learning_rate": 0.00015072539747638887,
5189
+ "loss": 6.2346,
5190
+ "step": 732
5191
+ },
5192
+ {
5193
+ "epoch": 0.7611630321910696,
5194
+ "grad_norm": 1.765625,
5195
+ "learning_rate": 0.00014949770140714407,
5196
+ "loss": 5.5064,
5197
+ "step": 733
5198
+ },
5199
+ {
5200
+ "epoch": 0.7622014537902388,
5201
+ "grad_norm": 1.296875,
5202
+ "learning_rate": 0.0001482741462670193,
5203
+ "loss": 6.3146,
5204
+ "step": 734
5205
+ },
5206
+ {
5207
+ "epoch": 0.7632398753894081,
5208
+ "grad_norm": 1.578125,
5209
+ "learning_rate": 0.0001470547465114263,
5210
+ "loss": 6.2277,
5211
+ "step": 735
5212
+ },
5213
+ {
5214
+ "epoch": 0.7642782969885774,
5215
+ "grad_norm": 1.328125,
5216
+ "learning_rate": 0.00014583951654668415,
5217
+ "loss": 6.3032,
5218
+ "step": 736
5219
+ },
5220
+ {
5221
+ "epoch": 0.7653167185877466,
5222
+ "grad_norm": 1.6484375,
5223
+ "learning_rate": 0.00014462847072984898,
5224
+ "loss": 5.9154,
5225
+ "step": 737
5226
+ },
5227
+ {
5228
+ "epoch": 0.7663551401869159,
5229
+ "grad_norm": 1.6484375,
5230
+ "learning_rate": 0.0001434216233685441,
5231
+ "loss": 6.0951,
5232
+ "step": 738
5233
+ },
5234
+ {
5235
+ "epoch": 0.7673935617860852,
5236
+ "grad_norm": 1.578125,
5237
+ "learning_rate": 0.00014221898872079108,
5238
+ "loss": 6.0921,
5239
+ "step": 739
5240
+ },
5241
+ {
5242
+ "epoch": 0.7684319833852544,
5243
+ "grad_norm": 1.5703125,
5244
+ "learning_rate": 0.0001410205809948419,
5245
+ "loss": 6.2295,
5246
+ "step": 740
5247
+ },
5248
+ {
5249
+ "epoch": 0.7694704049844237,
5250
+ "grad_norm": 1.3515625,
5251
+ "learning_rate": 0.00013982641434900984,
5252
+ "loss": 6.229,
5253
+ "step": 741
5254
+ },
5255
+ {
5256
+ "epoch": 0.770508826583593,
5257
+ "grad_norm": 1.5,
5258
+ "learning_rate": 0.00013863650289150338,
5259
+ "loss": 6.3173,
5260
+ "step": 742
5261
+ },
5262
+ {
5263
+ "epoch": 0.7715472481827622,
5264
+ "grad_norm": 1.515625,
5265
+ "learning_rate": 0.00013745086068025857,
5266
+ "loss": 6.3666,
5267
+ "step": 743
5268
+ },
5269
+ {
5270
+ "epoch": 0.7725856697819314,
5271
+ "grad_norm": 1.15625,
5272
+ "learning_rate": 0.00013626950172277398,
5273
+ "loss": 6.1824,
5274
+ "step": 744
5275
+ },
5276
+ {
5277
+ "epoch": 0.7736240913811008,
5278
+ "grad_norm": 1.78125,
5279
+ "learning_rate": 0.00013509243997594423,
5280
+ "loss": 6.2045,
5281
+ "step": 745
5282
+ },
5283
+ {
5284
+ "epoch": 0.77466251298027,
5285
+ "grad_norm": 1.359375,
5286
+ "learning_rate": 0.00013391968934589572,
5287
+ "loss": 6.295,
5288
+ "step": 746
5289
+ },
5290
+ {
5291
+ "epoch": 0.7757009345794392,
5292
+ "grad_norm": 1.265625,
5293
+ "learning_rate": 0.00013275126368782235,
5294
+ "loss": 6.3082,
5295
+ "step": 747
5296
+ },
5297
+ {
5298
+ "epoch": 0.7767393561786086,
5299
+ "grad_norm": 1.5703125,
5300
+ "learning_rate": 0.00013158717680582127,
5301
+ "loss": 6.2444,
5302
+ "step": 748
5303
+ },
5304
+ {
5305
+ "epoch": 0.7777777777777778,
5306
+ "grad_norm": 1.4296875,
5307
+ "learning_rate": 0.00013042744245273037,
5308
+ "loss": 6.1545,
5309
+ "step": 749
5310
+ },
5311
+ {
5312
+ "epoch": 0.778816199376947,
5313
+ "grad_norm": 1.75,
5314
+ "learning_rate": 0.0001292720743299654,
5315
+ "loss": 6.0637,
5316
+ "step": 750
5317
+ },
5318
+ {
5319
+ "epoch": 0.7798546209761164,
5320
+ "grad_norm": 1.4453125,
5321
+ "learning_rate": 0.00012812108608735846,
5322
+ "loss": 6.0392,
5323
+ "step": 751
5324
+ },
5325
+ {
5326
+ "epoch": 0.7808930425752856,
5327
+ "grad_norm": 1.6796875,
5328
+ "learning_rate": 0.0001269744913229965,
5329
+ "loss": 6.285,
5330
+ "step": 752
5331
+ },
5332
+ {
5333
+ "epoch": 0.7819314641744548,
5334
+ "grad_norm": 1.21875,
5335
+ "learning_rate": 0.00012583230358306053,
5336
+ "loss": 6.3178,
5337
+ "step": 753
5338
+ },
5339
+ {
5340
+ "epoch": 0.7829698857736241,
5341
+ "grad_norm": 1.578125,
5342
+ "learning_rate": 0.00012469453636166643,
5343
+ "loss": 6.1123,
5344
+ "step": 754
5345
+ },
5346
+ {
5347
+ "epoch": 0.7840083073727934,
5348
+ "grad_norm": 1.453125,
5349
+ "learning_rate": 0.00012356120310070407,
5350
+ "loss": 6.379,
5351
+ "step": 755
5352
+ },
5353
+ {
5354
+ "epoch": 0.7850467289719626,
5355
+ "grad_norm": 1.625,
5356
+ "learning_rate": 0.00012243231718967967,
5357
+ "loss": 6.127,
5358
+ "step": 756
5359
+ },
5360
+ {
5361
+ "epoch": 0.7860851505711319,
5362
+ "grad_norm": 1.1796875,
5363
+ "learning_rate": 0.0001213078919655573,
5364
+ "loss": 6.222,
5365
+ "step": 757
5366
+ },
5367
+ {
5368
+ "epoch": 0.7871235721703012,
5369
+ "grad_norm": 1.1875,
5370
+ "learning_rate": 0.00012018794071260119,
5371
+ "loss": 6.0595,
5372
+ "step": 758
5373
+ },
5374
+ {
5375
+ "epoch": 0.7881619937694704,
5376
+ "grad_norm": 1.2421875,
5377
+ "learning_rate": 0.00011907247666221893,
5378
+ "loss": 6.1771,
5379
+ "step": 759
5380
+ },
5381
+ {
5382
+ "epoch": 0.7892004153686397,
5383
+ "grad_norm": 1.5,
5384
+ "learning_rate": 0.00011796151299280483,
5385
+ "loss": 6.1493,
5386
+ "step": 760
5387
+ },
5388
+ {
5389
+ "epoch": 0.7902388369678089,
5390
+ "grad_norm": 1.3046875,
5391
+ "learning_rate": 0.00011685506282958496,
5392
+ "loss": 6.3724,
5393
+ "step": 761
5394
+ },
5395
+ {
5396
+ "epoch": 0.7912772585669782,
5397
+ "grad_norm": 1.8203125,
5398
+ "learning_rate": 0.00011575313924446123,
5399
+ "loss": 6.1028,
5400
+ "step": 762
5401
+ },
5402
+ {
5403
+ "epoch": 0.7923156801661475,
5404
+ "grad_norm": 1.5234375,
5405
+ "learning_rate": 0.00011465575525585741,
5406
+ "loss": 6.1988,
5407
+ "step": 763
5408
+ },
5409
+ {
5410
+ "epoch": 0.7933541017653167,
5411
+ "grad_norm": 3.03125,
5412
+ "learning_rate": 0.00011356292382856532,
5413
+ "loss": 6.1213,
5414
+ "step": 764
5415
+ },
5416
+ {
5417
+ "epoch": 0.794392523364486,
5418
+ "grad_norm": 1.8515625,
5419
+ "learning_rate": 0.0001124746578735914,
5420
+ "loss": 6.3859,
5421
+ "step": 765
5422
+ },
5423
+ {
5424
+ "epoch": 0.7954309449636553,
5425
+ "grad_norm": 1.8984375,
5426
+ "learning_rate": 0.0001113909702480046,
5427
+ "loss": 6.0653,
5428
+ "step": 766
5429
+ },
5430
+ {
5431
+ "epoch": 0.7964693665628245,
5432
+ "grad_norm": 1.359375,
5433
+ "learning_rate": 0.00011031187375478407,
5434
+ "loss": 6.2933,
5435
+ "step": 767
5436
+ },
5437
+ {
5438
+ "epoch": 0.7975077881619937,
5439
+ "grad_norm": 1.3203125,
5440
+ "learning_rate": 0.00010923738114266823,
5441
+ "loss": 6.0991,
5442
+ "step": 768
5443
+ },
5444
+ {
5445
+ "epoch": 0.7985462097611631,
5446
+ "grad_norm": 1.4921875,
5447
+ "learning_rate": 0.00010816750510600387,
5448
+ "loss": 6.2484,
5449
+ "step": 769
5450
+ },
5451
+ {
5452
+ "epoch": 0.7995846313603323,
5453
+ "grad_norm": 1.703125,
5454
+ "learning_rate": 0.00010710225828459641,
5455
+ "loss": 5.7827,
5456
+ "step": 770
5457
+ },
5458
+ {
5459
+ "epoch": 0.8006230529595015,
5460
+ "grad_norm": 1.46875,
5461
+ "learning_rate": 0.0001060416532635603,
5462
+ "loss": 6.3373,
5463
+ "step": 771
5464
+ },
5465
+ {
5466
+ "epoch": 0.8016614745586709,
5467
+ "grad_norm": 1.2265625,
5468
+ "learning_rate": 0.00010498570257317076,
5469
+ "loss": 6.3325,
5470
+ "step": 772
5471
+ },
5472
+ {
5473
+ "epoch": 0.8026998961578401,
5474
+ "grad_norm": 1.578125,
5475
+ "learning_rate": 0.00010393441868871506,
5476
+ "loss": 6.1373,
5477
+ "step": 773
5478
+ },
5479
+ {
5480
+ "epoch": 0.8037383177570093,
5481
+ "grad_norm": 1.609375,
5482
+ "learning_rate": 0.00010288781403034619,
5483
+ "loss": 6.06,
5484
+ "step": 774
5485
+ },
5486
+ {
5487
+ "epoch": 0.8047767393561787,
5488
+ "grad_norm": 1.71875,
5489
+ "learning_rate": 0.00010184590096293506,
5490
+ "loss": 6.0622,
5491
+ "step": 775
5492
+ },
5493
+ {
5494
+ "epoch": 0.8058151609553479,
5495
+ "grad_norm": 1.0546875,
5496
+ "learning_rate": 0.0001008086917959249,
5497
+ "loss": 6.2272,
5498
+ "step": 776
5499
+ },
5500
+ {
5501
+ "epoch": 0.8068535825545171,
5502
+ "grad_norm": 1.3828125,
5503
+ "learning_rate": 9.977619878318578e-05,
5504
+ "loss": 6.2692,
5505
+ "step": 777
5506
+ },
5507
+ {
5508
+ "epoch": 0.8078920041536864,
5509
+ "grad_norm": 1.2265625,
5510
+ "learning_rate": 9.874843412286993e-05,
5511
+ "loss": 6.3817,
5512
+ "step": 778
5513
+ },
5514
+ {
5515
+ "epoch": 0.8089304257528557,
5516
+ "grad_norm": 1.9921875,
5517
+ "learning_rate": 9.772540995726753e-05,
5518
+ "loss": 6.4033,
5519
+ "step": 779
5520
+ },
5521
+ {
5522
+ "epoch": 0.8099688473520249,
5523
+ "grad_norm": 1.625,
5524
+ "learning_rate": 9.67071383726632e-05,
5525
+ "loss": 6.0418,
5526
+ "step": 780
5527
+ },
5528
+ {
5529
+ "epoch": 0.8110072689511942,
5530
+ "grad_norm": 1.3125,
5531
+ "learning_rate": 9.569363139919341e-05,
5532
+ "loss": 6.2407,
5533
+ "step": 781
5534
+ },
5535
+ {
5536
+ "epoch": 0.8120456905503635,
5537
+ "grad_norm": 1.6796875,
5538
+ "learning_rate": 9.468490101070409e-05,
5539
+ "loss": 6.1643,
5540
+ "step": 782
5541
+ },
5542
+ {
5543
+ "epoch": 0.8130841121495327,
5544
+ "grad_norm": 1.1796875,
5545
+ "learning_rate": 9.368095912460934e-05,
5546
+ "loss": 6.1331,
5547
+ "step": 783
5548
+ },
5549
+ {
5550
+ "epoch": 0.814122533748702,
5551
+ "grad_norm": 1.2421875,
5552
+ "learning_rate": 9.26818176017506e-05,
5553
+ "loss": 6.363,
5554
+ "step": 784
5555
+ },
5556
+ {
5557
+ "epoch": 0.8151609553478713,
5558
+ "grad_norm": 1.4609375,
5559
+ "learning_rate": 9.168748824625655e-05,
5560
+ "loss": 6.2178,
5561
+ "step": 785
5562
+ },
5563
+ {
5564
+ "epoch": 0.8161993769470405,
5565
+ "grad_norm": 1.6328125,
5566
+ "learning_rate": 9.069798280540348e-05,
5567
+ "loss": 6.146,
5568
+ "step": 786
5569
+ },
5570
+ {
5571
+ "epoch": 0.8172377985462098,
5572
+ "grad_norm": 1.4453125,
5573
+ "learning_rate": 8.9713312969477e-05,
5574
+ "loss": 6.1887,
5575
+ "step": 787
5576
+ },
5577
+ {
5578
+ "epoch": 0.818276220145379,
5579
+ "grad_norm": 1.2421875,
5580
+ "learning_rate": 8.87334903716332e-05,
5581
+ "loss": 6.2521,
5582
+ "step": 788
5583
+ },
5584
+ {
5585
+ "epoch": 0.8193146417445483,
5586
+ "grad_norm": 1.265625,
5587
+ "learning_rate": 8.775852658776173e-05,
5588
+ "loss": 6.3487,
5589
+ "step": 789
5590
+ },
5591
+ {
5592
+ "epoch": 0.8203530633437176,
5593
+ "grad_norm": 1.5234375,
5594
+ "learning_rate": 8.678843313634893e-05,
5595
+ "loss": 6.2509,
5596
+ "step": 790
5597
+ },
5598
+ {
5599
+ "epoch": 0.8213914849428868,
5600
+ "grad_norm": 1.71875,
5601
+ "learning_rate": 8.58232214783416e-05,
5602
+ "loss": 6.0586,
5603
+ "step": 791
5604
+ },
5605
+ {
5606
+ "epoch": 0.822429906542056,
5607
+ "grad_norm": 1.4921875,
5608
+ "learning_rate": 8.486290301701182e-05,
5609
+ "loss": 6.293,
5610
+ "step": 792
5611
+ },
5612
+ {
5613
+ "epoch": 0.8234683281412254,
5614
+ "grad_norm": 1.5,
5615
+ "learning_rate": 8.390748909782204e-05,
5616
+ "loss": 6.2504,
5617
+ "step": 793
5618
+ },
5619
+ {
5620
+ "epoch": 0.8245067497403946,
5621
+ "grad_norm": 1.28125,
5622
+ "learning_rate": 8.295699100829124e-05,
5623
+ "loss": 6.2907,
5624
+ "step": 794
5625
+ },
5626
+ {
5627
+ "epoch": 0.8255451713395638,
5628
+ "grad_norm": 1.125,
5629
+ "learning_rate": 8.201141997786127e-05,
5630
+ "loss": 6.2033,
5631
+ "step": 795
5632
+ },
5633
+ {
5634
+ "epoch": 0.8265835929387332,
5635
+ "grad_norm": 1.2109375,
5636
+ "learning_rate": 8.107078717776456e-05,
5637
+ "loss": 6.3058,
5638
+ "step": 796
5639
+ },
5640
+ {
5641
+ "epoch": 0.8276220145379024,
5642
+ "grad_norm": 1.4296875,
5643
+ "learning_rate": 8.013510372089184e-05,
5644
+ "loss": 6.1276,
5645
+ "step": 797
5646
+ },
5647
+ {
5648
+ "epoch": 0.8286604361370716,
5649
+ "grad_norm": 1.4140625,
5650
+ "learning_rate": 7.920438066166097e-05,
5651
+ "loss": 6.4023,
5652
+ "step": 798
5653
+ },
5654
+ {
5655
+ "epoch": 0.829698857736241,
5656
+ "grad_norm": 1.953125,
5657
+ "learning_rate": 7.827862899588634e-05,
5658
+ "loss": 6.1487,
5659
+ "step": 799
5660
+ },
5661
+ {
5662
+ "epoch": 0.8307372793354102,
5663
+ "grad_norm": 1.3671875,
5664
+ "learning_rate": 7.735785966064884e-05,
5665
+ "loss": 5.9001,
5666
+ "step": 800
5667
+ },
5668
+ {
5669
+ "epoch": 0.8307372793354102,
5670
+ "eval_loss": 6.255230903625488,
5671
+ "eval_runtime": 1.6449,
5672
+ "eval_samples_per_second": 9.727,
5673
+ "eval_steps_per_second": 1.216,
5674
+ "step": 800
5675
  }
5676
  ],
5677
  "logging_steps": 1,
 
5691
  "attributes": {}
5692
  }
5693
  },
5694
+ "total_flos": 1.018894554759168e+17,
5695
  "train_batch_size": 16,
5696
  "trial_name": null,
5697
  "trial_params": null