CocoRoF commited on
Commit
eb97b44
·
verified ·
1 Parent(s): f474e97

Training in progress, step 4000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:004c78f3624f93420c8bcc03a3ca9b8a37f2a3690b0727d9baad61326ff4924a
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22241369eebb806242a4b639f3c9754a5e2972e6f03c1324e455acf307e57407
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f52c0af4dcd803a82e086998f8d31a06630a29f9e5698a02cb802f2b5768b16e
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93b057735ed1127cf96ad37213db6e59d41bf997da6d76cc4b664ddb0416cbdd
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04cb5208648fd09a2e0403d51973f74ffbfd93cbd5da59e1e99c8df03769a86c
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e735ed11597ed40a2b6854e0229902e1a21fedc0a0dbc608ca905fae57d5b06b
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7034685b36b93a4dd3a50697b0b1c314b249b2189ec2cb96b757312b1514a579
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ba3815fc0953b1b7f08cea092dfc0a62c4bbc2a2c68780d3f4dd0b5e22582a7
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e851fe1c1de0057f4eecefed6a131fa9021334eb43f6e7e65fdb270a25ac864
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:647ac15563fcad903adbb616e9b2c36b237a3ed5939d088620212da969930f6c
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:978379030048e432baa510ec4fc9514faa08fe564ab964b3a4d05e8f60306495
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93e3733c5b180986b7efbec17b663bf5231343d187374d184768fcd913797167
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdbc75d90af112615b53d15931e8157a80e37bcd110aac9a3089f5f6f5344171
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9820ea4fec1b01f3da091290c3e8b5ddb86a3a3fa17285c248b64910c2d0b4f0
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c8a310f6ca2ca89570eb2cc68544656b30224f00b2d6d96eeda6e0cb8be50ab
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7413035def085e41776a629afc94fc24fe5a955f1ad83b32f9b370ab60f9a18d
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c5b8110fcf6e044b6860c6305be969cfe03129549b92dc6fc2394448e9265d6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91e3953bcbf4089415abffbd914fbbe4580121f6c843eabbf70624c5ed144814
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f936acaf5a2d5fe8c38d945450417facbf1577584c216908a396d3cc20bec88
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:369fde7bff4dfc0d6b9cf773cf9b0352696083f84763999e05a631ee6d52c5e3
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47f6ae61b86ec1e2b5a6767419416b8803731fef933343a6831b194cd48a7616
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bef6a7bf53166ec3a9709e315e5a7afc807cf01be6b61a09c96b7113cbb6fd6
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8091140923112489,
5
  "eval_steps": 500,
6
- "global_step": 3500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4963,6 +4963,714 @@
4963
  "eval_samples_per_second": 608.68,
4964
  "eval_steps_per_second": 38.043,
4965
  "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4966
  }
4967
  ],
4968
  "logging_steps": 5,
@@ -4982,7 +5690,7 @@
4982
  "attributes": {}
4983
  }
4984
  },
4985
- "total_flos": 1.5163252974760755e+19,
4986
  "train_batch_size": 4,
4987
  "trial_name": null,
4988
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9247018197842845,
5
  "eval_steps": 500,
6
+ "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4963
  "eval_samples_per_second": 608.68,
4964
  "eval_steps_per_second": 38.043,
4965
  "step": 3500
4966
+ },
4967
+ {
4968
+ "epoch": 0.8102699695859792,
4969
+ "grad_norm": 164.375,
4970
+ "learning_rate": 2.1068859198355603e-06,
4971
+ "loss": 67.4157,
4972
+ "step": 3505
4973
+ },
4974
+ {
4975
+ "epoch": 0.8114258468607095,
4976
+ "grad_norm": 159.5,
4977
+ "learning_rate": 2.094039054470709e-06,
4978
+ "loss": 67.4029,
4979
+ "step": 3510
4980
+ },
4981
+ {
4982
+ "epoch": 0.8125817241354399,
4983
+ "grad_norm": 143.25,
4984
+ "learning_rate": 2.0811921891058584e-06,
4985
+ "loss": 67.8529,
4986
+ "step": 3515
4987
+ },
4988
+ {
4989
+ "epoch": 0.8137376014101703,
4990
+ "grad_norm": 156.25,
4991
+ "learning_rate": 2.0683453237410072e-06,
4992
+ "loss": 66.8855,
4993
+ "step": 3520
4994
+ },
4995
+ {
4996
+ "epoch": 0.8148934786849006,
4997
+ "grad_norm": 168.75,
4998
+ "learning_rate": 2.0554984583761565e-06,
4999
+ "loss": 68.2299,
5000
+ "step": 3525
5001
+ },
5002
+ {
5003
+ "epoch": 0.816049355959631,
5004
+ "grad_norm": 149.0,
5005
+ "learning_rate": 2.0426515930113053e-06,
5006
+ "loss": 66.2473,
5007
+ "step": 3530
5008
+ },
5009
+ {
5010
+ "epoch": 0.8172052332343613,
5011
+ "grad_norm": 149.0,
5012
+ "learning_rate": 2.0298047276464546e-06,
5013
+ "loss": 66.8619,
5014
+ "step": 3535
5015
+ },
5016
+ {
5017
+ "epoch": 0.8183611105090917,
5018
+ "grad_norm": 171.375,
5019
+ "learning_rate": 2.0169578622816035e-06,
5020
+ "loss": 67.1076,
5021
+ "step": 3540
5022
+ },
5023
+ {
5024
+ "epoch": 0.8195169877838221,
5025
+ "grad_norm": 153.875,
5026
+ "learning_rate": 2.0041109969167523e-06,
5027
+ "loss": 68.1818,
5028
+ "step": 3545
5029
+ },
5030
+ {
5031
+ "epoch": 0.8206728650585524,
5032
+ "grad_norm": 169.25,
5033
+ "learning_rate": 1.9912641315519016e-06,
5034
+ "loss": 67.3567,
5035
+ "step": 3550
5036
+ },
5037
+ {
5038
+ "epoch": 0.8218287423332827,
5039
+ "grad_norm": 175.75,
5040
+ "learning_rate": 1.9784172661870504e-06,
5041
+ "loss": 68.2824,
5042
+ "step": 3555
5043
+ },
5044
+ {
5045
+ "epoch": 0.8229846196080132,
5046
+ "grad_norm": 157.125,
5047
+ "learning_rate": 1.9655704008221997e-06,
5048
+ "loss": 68.0355,
5049
+ "step": 3560
5050
+ },
5051
+ {
5052
+ "epoch": 0.8241404968827435,
5053
+ "grad_norm": 166.875,
5054
+ "learning_rate": 1.9527235354573485e-06,
5055
+ "loss": 65.4001,
5056
+ "step": 3565
5057
+ },
5058
+ {
5059
+ "epoch": 0.8252963741574738,
5060
+ "grad_norm": 161.125,
5061
+ "learning_rate": 1.9398766700924974e-06,
5062
+ "loss": 66.6461,
5063
+ "step": 3570
5064
+ },
5065
+ {
5066
+ "epoch": 0.8264522514322041,
5067
+ "grad_norm": 158.375,
5068
+ "learning_rate": 1.9270298047276467e-06,
5069
+ "loss": 66.6772,
5070
+ "step": 3575
5071
+ },
5072
+ {
5073
+ "epoch": 0.8276081287069346,
5074
+ "grad_norm": 147.75,
5075
+ "learning_rate": 1.9141829393627955e-06,
5076
+ "loss": 66.801,
5077
+ "step": 3580
5078
+ },
5079
+ {
5080
+ "epoch": 0.8287640059816649,
5081
+ "grad_norm": 155.25,
5082
+ "learning_rate": 1.9013360739979448e-06,
5083
+ "loss": 66.6397,
5084
+ "step": 3585
5085
+ },
5086
+ {
5087
+ "epoch": 0.8299198832563952,
5088
+ "grad_norm": 167.625,
5089
+ "learning_rate": 1.8884892086330936e-06,
5090
+ "loss": 67.2453,
5091
+ "step": 3590
5092
+ },
5093
+ {
5094
+ "epoch": 0.8310757605311256,
5095
+ "grad_norm": 158.5,
5096
+ "learning_rate": 1.8756423432682427e-06,
5097
+ "loss": 67.7861,
5098
+ "step": 3595
5099
+ },
5100
+ {
5101
+ "epoch": 0.832231637805856,
5102
+ "grad_norm": 155.625,
5103
+ "learning_rate": 1.8627954779033915e-06,
5104
+ "loss": 68.5794,
5105
+ "step": 3600
5106
+ },
5107
+ {
5108
+ "epoch": 0.8333875150805863,
5109
+ "grad_norm": 158.625,
5110
+ "learning_rate": 1.8499486125385408e-06,
5111
+ "loss": 68.7008,
5112
+ "step": 3605
5113
+ },
5114
+ {
5115
+ "epoch": 0.8345433923553167,
5116
+ "grad_norm": 168.25,
5117
+ "learning_rate": 1.8371017471736899e-06,
5118
+ "loss": 67.8454,
5119
+ "step": 3610
5120
+ },
5121
+ {
5122
+ "epoch": 0.835699269630047,
5123
+ "grad_norm": 155.0,
5124
+ "learning_rate": 1.8242548818088387e-06,
5125
+ "loss": 67.5842,
5126
+ "step": 3615
5127
+ },
5128
+ {
5129
+ "epoch": 0.8368551469047774,
5130
+ "grad_norm": 169.0,
5131
+ "learning_rate": 1.811408016443988e-06,
5132
+ "loss": 66.7899,
5133
+ "step": 3620
5134
+ },
5135
+ {
5136
+ "epoch": 0.8380110241795078,
5137
+ "grad_norm": 165.25,
5138
+ "learning_rate": 1.7985611510791368e-06,
5139
+ "loss": 66.9411,
5140
+ "step": 3625
5141
+ },
5142
+ {
5143
+ "epoch": 0.8391669014542381,
5144
+ "grad_norm": 158.125,
5145
+ "learning_rate": 1.7857142857142859e-06,
5146
+ "loss": 65.2401,
5147
+ "step": 3630
5148
+ },
5149
+ {
5150
+ "epoch": 0.8403227787289684,
5151
+ "grad_norm": 152.5,
5152
+ "learning_rate": 1.7728674203494347e-06,
5153
+ "loss": 66.357,
5154
+ "step": 3635
5155
+ },
5156
+ {
5157
+ "epoch": 0.8414786560036988,
5158
+ "grad_norm": 166.75,
5159
+ "learning_rate": 1.760020554984584e-06,
5160
+ "loss": 66.9579,
5161
+ "step": 3640
5162
+ },
5163
+ {
5164
+ "epoch": 0.8426345332784292,
5165
+ "grad_norm": 151.875,
5166
+ "learning_rate": 1.747173689619733e-06,
5167
+ "loss": 68.2974,
5168
+ "step": 3645
5169
+ },
5170
+ {
5171
+ "epoch": 0.8437904105531595,
5172
+ "grad_norm": 151.75,
5173
+ "learning_rate": 1.734326824254882e-06,
5174
+ "loss": 66.9873,
5175
+ "step": 3650
5176
+ },
5177
+ {
5178
+ "epoch": 0.8449462878278898,
5179
+ "grad_norm": 168.125,
5180
+ "learning_rate": 1.721479958890031e-06,
5181
+ "loss": 67.4954,
5182
+ "step": 3655
5183
+ },
5184
+ {
5185
+ "epoch": 0.8461021651026203,
5186
+ "grad_norm": 148.5,
5187
+ "learning_rate": 1.7086330935251798e-06,
5188
+ "loss": 67.3915,
5189
+ "step": 3660
5190
+ },
5191
+ {
5192
+ "epoch": 0.8472580423773506,
5193
+ "grad_norm": 159.125,
5194
+ "learning_rate": 1.695786228160329e-06,
5195
+ "loss": 67.0368,
5196
+ "step": 3665
5197
+ },
5198
+ {
5199
+ "epoch": 0.8484139196520809,
5200
+ "grad_norm": 158.0,
5201
+ "learning_rate": 1.682939362795478e-06,
5202
+ "loss": 67.0658,
5203
+ "step": 3670
5204
+ },
5205
+ {
5206
+ "epoch": 0.8495697969268113,
5207
+ "grad_norm": 152.625,
5208
+ "learning_rate": 1.670092497430627e-06,
5209
+ "loss": 65.5299,
5210
+ "step": 3675
5211
+ },
5212
+ {
5213
+ "epoch": 0.8507256742015417,
5214
+ "grad_norm": 151.125,
5215
+ "learning_rate": 1.6572456320657763e-06,
5216
+ "loss": 66.0855,
5217
+ "step": 3680
5218
+ },
5219
+ {
5220
+ "epoch": 0.851881551476272,
5221
+ "grad_norm": 153.5,
5222
+ "learning_rate": 1.6443987667009251e-06,
5223
+ "loss": 67.1155,
5224
+ "step": 3685
5225
+ },
5226
+ {
5227
+ "epoch": 0.8530374287510024,
5228
+ "grad_norm": 146.25,
5229
+ "learning_rate": 1.6315519013360742e-06,
5230
+ "loss": 66.6861,
5231
+ "step": 3690
5232
+ },
5233
+ {
5234
+ "epoch": 0.8541933060257327,
5235
+ "grad_norm": 166.0,
5236
+ "learning_rate": 1.618705035971223e-06,
5237
+ "loss": 67.9181,
5238
+ "step": 3695
5239
+ },
5240
+ {
5241
+ "epoch": 0.8553491833004631,
5242
+ "grad_norm": 150.625,
5243
+ "learning_rate": 1.6058581706063723e-06,
5244
+ "loss": 67.422,
5245
+ "step": 3700
5246
+ },
5247
+ {
5248
+ "epoch": 0.8565050605751934,
5249
+ "grad_norm": 159.0,
5250
+ "learning_rate": 1.5930113052415211e-06,
5251
+ "loss": 66.2905,
5252
+ "step": 3705
5253
+ },
5254
+ {
5255
+ "epoch": 0.8576609378499238,
5256
+ "grad_norm": 159.125,
5257
+ "learning_rate": 1.5801644398766702e-06,
5258
+ "loss": 67.4452,
5259
+ "step": 3710
5260
+ },
5261
+ {
5262
+ "epoch": 0.8588168151246541,
5263
+ "grad_norm": 151.875,
5264
+ "learning_rate": 1.5673175745118195e-06,
5265
+ "loss": 66.2737,
5266
+ "step": 3715
5267
+ },
5268
+ {
5269
+ "epoch": 0.8599726923993845,
5270
+ "grad_norm": 166.5,
5271
+ "learning_rate": 1.5544707091469683e-06,
5272
+ "loss": 67.6893,
5273
+ "step": 3720
5274
+ },
5275
+ {
5276
+ "epoch": 0.8611285696741149,
5277
+ "grad_norm": 158.375,
5278
+ "learning_rate": 1.5416238437821174e-06,
5279
+ "loss": 67.26,
5280
+ "step": 3725
5281
+ },
5282
+ {
5283
+ "epoch": 0.8622844469488452,
5284
+ "grad_norm": 164.25,
5285
+ "learning_rate": 1.5287769784172662e-06,
5286
+ "loss": 67.7357,
5287
+ "step": 3730
5288
+ },
5289
+ {
5290
+ "epoch": 0.8634403242235755,
5291
+ "grad_norm": 156.125,
5292
+ "learning_rate": 1.5159301130524153e-06,
5293
+ "loss": 65.816,
5294
+ "step": 3735
5295
+ },
5296
+ {
5297
+ "epoch": 0.864596201498306,
5298
+ "grad_norm": 158.0,
5299
+ "learning_rate": 1.5030832476875643e-06,
5300
+ "loss": 66.2453,
5301
+ "step": 3740
5302
+ },
5303
+ {
5304
+ "epoch": 0.8657520787730363,
5305
+ "grad_norm": 146.75,
5306
+ "learning_rate": 1.4902363823227134e-06,
5307
+ "loss": 66.5201,
5308
+ "step": 3745
5309
+ },
5310
+ {
5311
+ "epoch": 0.8669079560477666,
5312
+ "grad_norm": 161.125,
5313
+ "learning_rate": 1.4773895169578625e-06,
5314
+ "loss": 67.2529,
5315
+ "step": 3750
5316
+ },
5317
+ {
5318
+ "epoch": 0.868063833322497,
5319
+ "grad_norm": 180.5,
5320
+ "learning_rate": 1.4645426515930113e-06,
5321
+ "loss": 66.3079,
5322
+ "step": 3755
5323
+ },
5324
+ {
5325
+ "epoch": 0.8692197105972274,
5326
+ "grad_norm": 149.375,
5327
+ "learning_rate": 1.4516957862281606e-06,
5328
+ "loss": 68.2998,
5329
+ "step": 3760
5330
+ },
5331
+ {
5332
+ "epoch": 0.8703755878719577,
5333
+ "grad_norm": 165.875,
5334
+ "learning_rate": 1.4388489208633094e-06,
5335
+ "loss": 66.4161,
5336
+ "step": 3765
5337
+ },
5338
+ {
5339
+ "epoch": 0.871531465146688,
5340
+ "grad_norm": 164.375,
5341
+ "learning_rate": 1.4260020554984585e-06,
5342
+ "loss": 66.5728,
5343
+ "step": 3770
5344
+ },
5345
+ {
5346
+ "epoch": 0.8726873424214184,
5347
+ "grad_norm": 150.125,
5348
+ "learning_rate": 1.4131551901336073e-06,
5349
+ "loss": 66.1363,
5350
+ "step": 3775
5351
+ },
5352
+ {
5353
+ "epoch": 0.8738432196961488,
5354
+ "grad_norm": 143.75,
5355
+ "learning_rate": 1.4003083247687566e-06,
5356
+ "loss": 67.2544,
5357
+ "step": 3780
5358
+ },
5359
+ {
5360
+ "epoch": 0.8749990969708791,
5361
+ "grad_norm": 148.75,
5362
+ "learning_rate": 1.3874614594039057e-06,
5363
+ "loss": 65.8916,
5364
+ "step": 3785
5365
+ },
5366
+ {
5367
+ "epoch": 0.8761549742456095,
5368
+ "grad_norm": 163.375,
5369
+ "learning_rate": 1.3746145940390545e-06,
5370
+ "loss": 65.9047,
5371
+ "step": 3790
5372
+ },
5373
+ {
5374
+ "epoch": 0.8773108515203398,
5375
+ "grad_norm": 153.625,
5376
+ "learning_rate": 1.3617677286742038e-06,
5377
+ "loss": 66.0558,
5378
+ "step": 3795
5379
+ },
5380
+ {
5381
+ "epoch": 0.8784667287950702,
5382
+ "grad_norm": 168.75,
5383
+ "learning_rate": 1.3489208633093526e-06,
5384
+ "loss": 67.7262,
5385
+ "step": 3800
5386
+ },
5387
+ {
5388
+ "epoch": 0.8796226060698006,
5389
+ "grad_norm": 154.5,
5390
+ "learning_rate": 1.3360739979445017e-06,
5391
+ "loss": 66.8116,
5392
+ "step": 3805
5393
+ },
5394
+ {
5395
+ "epoch": 0.8807784833445309,
5396
+ "grad_norm": 141.875,
5397
+ "learning_rate": 1.3232271325796505e-06,
5398
+ "loss": 66.8746,
5399
+ "step": 3810
5400
+ },
5401
+ {
5402
+ "epoch": 0.8819343606192612,
5403
+ "grad_norm": 162.0,
5404
+ "learning_rate": 1.3103802672147998e-06,
5405
+ "loss": 66.7104,
5406
+ "step": 3815
5407
+ },
5408
+ {
5409
+ "epoch": 0.8830902378939917,
5410
+ "grad_norm": 147.75,
5411
+ "learning_rate": 1.2975334018499486e-06,
5412
+ "loss": 66.5884,
5413
+ "step": 3820
5414
+ },
5415
+ {
5416
+ "epoch": 0.884246115168722,
5417
+ "grad_norm": 156.125,
5418
+ "learning_rate": 1.2846865364850977e-06,
5419
+ "loss": 68.1134,
5420
+ "step": 3825
5421
+ },
5422
+ {
5423
+ "epoch": 0.8854019924434523,
5424
+ "grad_norm": 159.5,
5425
+ "learning_rate": 1.2718396711202468e-06,
5426
+ "loss": 66.4509,
5427
+ "step": 3830
5428
+ },
5429
+ {
5430
+ "epoch": 0.8865578697181826,
5431
+ "grad_norm": 178.5,
5432
+ "learning_rate": 1.2589928057553958e-06,
5433
+ "loss": 65.9568,
5434
+ "step": 3835
5435
+ },
5436
+ {
5437
+ "epoch": 0.887713746992913,
5438
+ "grad_norm": 177.75,
5439
+ "learning_rate": 1.2461459403905449e-06,
5440
+ "loss": 68.2432,
5441
+ "step": 3840
5442
+ },
5443
+ {
5444
+ "epoch": 0.8888696242676434,
5445
+ "grad_norm": 145.0,
5446
+ "learning_rate": 1.233299075025694e-06,
5447
+ "loss": 65.4491,
5448
+ "step": 3845
5449
+ },
5450
+ {
5451
+ "epoch": 0.8900255015423737,
5452
+ "grad_norm": 150.25,
5453
+ "learning_rate": 1.2204522096608428e-06,
5454
+ "loss": 66.6217,
5455
+ "step": 3850
5456
+ },
5457
+ {
5458
+ "epoch": 0.8911813788171041,
5459
+ "grad_norm": 167.25,
5460
+ "learning_rate": 1.2076053442959918e-06,
5461
+ "loss": 66.3795,
5462
+ "step": 3855
5463
+ },
5464
+ {
5465
+ "epoch": 0.8923372560918345,
5466
+ "grad_norm": 159.125,
5467
+ "learning_rate": 1.194758478931141e-06,
5468
+ "loss": 67.1028,
5469
+ "step": 3860
5470
+ },
5471
+ {
5472
+ "epoch": 0.8934931333665648,
5473
+ "grad_norm": 152.5,
5474
+ "learning_rate": 1.18191161356629e-06,
5475
+ "loss": 66.2524,
5476
+ "step": 3865
5477
+ },
5478
+ {
5479
+ "epoch": 0.8946490106412952,
5480
+ "grad_norm": 155.25,
5481
+ "learning_rate": 1.1690647482014388e-06,
5482
+ "loss": 65.7507,
5483
+ "step": 3870
5484
+ },
5485
+ {
5486
+ "epoch": 0.8958048879160255,
5487
+ "grad_norm": 162.25,
5488
+ "learning_rate": 1.156217882836588e-06,
5489
+ "loss": 66.8517,
5490
+ "step": 3875
5491
+ },
5492
+ {
5493
+ "epoch": 0.8969607651907558,
5494
+ "grad_norm": 180.875,
5495
+ "learning_rate": 1.1433710174717371e-06,
5496
+ "loss": 66.3441,
5497
+ "step": 3880
5498
+ },
5499
+ {
5500
+ "epoch": 0.8981166424654863,
5501
+ "grad_norm": 156.75,
5502
+ "learning_rate": 1.130524152106886e-06,
5503
+ "loss": 65.7155,
5504
+ "step": 3885
5505
+ },
5506
+ {
5507
+ "epoch": 0.8992725197402166,
5508
+ "grad_norm": 163.0,
5509
+ "learning_rate": 1.117677286742035e-06,
5510
+ "loss": 66.562,
5511
+ "step": 3890
5512
+ },
5513
+ {
5514
+ "epoch": 0.9004283970149469,
5515
+ "grad_norm": 155.0,
5516
+ "learning_rate": 1.1048304213771841e-06,
5517
+ "loss": 66.4161,
5518
+ "step": 3895
5519
+ },
5520
+ {
5521
+ "epoch": 0.9015842742896772,
5522
+ "grad_norm": 148.625,
5523
+ "learning_rate": 1.091983556012333e-06,
5524
+ "loss": 65.2983,
5525
+ "step": 3900
5526
+ },
5527
+ {
5528
+ "epoch": 0.9027401515644077,
5529
+ "grad_norm": 155.25,
5530
+ "learning_rate": 1.079136690647482e-06,
5531
+ "loss": 64.831,
5532
+ "step": 3905
5533
+ },
5534
+ {
5535
+ "epoch": 0.903896028839138,
5536
+ "grad_norm": 160.75,
5537
+ "learning_rate": 1.066289825282631e-06,
5538
+ "loss": 66.6982,
5539
+ "step": 3910
5540
+ },
5541
+ {
5542
+ "epoch": 0.9050519061138683,
5543
+ "grad_norm": 159.625,
5544
+ "learning_rate": 1.0534429599177801e-06,
5545
+ "loss": 66.8952,
5546
+ "step": 3915
5547
+ },
5548
+ {
5549
+ "epoch": 0.9062077833885988,
5550
+ "grad_norm": 151.5,
5551
+ "learning_rate": 1.0405960945529292e-06,
5552
+ "loss": 66.3558,
5553
+ "step": 3920
5554
+ },
5555
+ {
5556
+ "epoch": 0.9073636606633291,
5557
+ "grad_norm": 162.25,
5558
+ "learning_rate": 1.0277492291880783e-06,
5559
+ "loss": 66.1818,
5560
+ "step": 3925
5561
+ },
5562
+ {
5563
+ "epoch": 0.9085195379380594,
5564
+ "grad_norm": 146.125,
5565
+ "learning_rate": 1.0149023638232273e-06,
5566
+ "loss": 66.7638,
5567
+ "step": 3930
5568
+ },
5569
+ {
5570
+ "epoch": 0.9096754152127898,
5571
+ "grad_norm": 158.0,
5572
+ "learning_rate": 1.0020554984583762e-06,
5573
+ "loss": 65.9911,
5574
+ "step": 3935
5575
+ },
5576
+ {
5577
+ "epoch": 0.9108312924875201,
5578
+ "grad_norm": 156.5,
5579
+ "learning_rate": 9.892086330935252e-07,
5580
+ "loss": 66.3479,
5581
+ "step": 3940
5582
+ },
5583
+ {
5584
+ "epoch": 0.9119871697622505,
5585
+ "grad_norm": 153.75,
5586
+ "learning_rate": 9.763617677286743e-07,
5587
+ "loss": 66.4355,
5588
+ "step": 3945
5589
+ },
5590
+ {
5591
+ "epoch": 0.9131430470369809,
5592
+ "grad_norm": 164.875,
5593
+ "learning_rate": 9.635149023638233e-07,
5594
+ "loss": 65.8444,
5595
+ "step": 3950
5596
+ },
5597
+ {
5598
+ "epoch": 0.9142989243117112,
5599
+ "grad_norm": 157.875,
5600
+ "learning_rate": 9.506680369989724e-07,
5601
+ "loss": 65.7399,
5602
+ "step": 3955
5603
+ },
5604
+ {
5605
+ "epoch": 0.9154548015864415,
5606
+ "grad_norm": 159.0,
5607
+ "learning_rate": 9.378211716341213e-07,
5608
+ "loss": 67.6389,
5609
+ "step": 3960
5610
+ },
5611
+ {
5612
+ "epoch": 0.9166106788611719,
5613
+ "grad_norm": 165.625,
5614
+ "learning_rate": 9.249743062692704e-07,
5615
+ "loss": 67.185,
5616
+ "step": 3965
5617
+ },
5618
+ {
5619
+ "epoch": 0.9177665561359023,
5620
+ "grad_norm": 154.5,
5621
+ "learning_rate": 9.121274409044194e-07,
5622
+ "loss": 66.45,
5623
+ "step": 3970
5624
+ },
5625
+ {
5626
+ "epoch": 0.9189224334106326,
5627
+ "grad_norm": 150.125,
5628
+ "learning_rate": 8.992805755395684e-07,
5629
+ "loss": 65.8415,
5630
+ "step": 3975
5631
+ },
5632
+ {
5633
+ "epoch": 0.9200783106853629,
5634
+ "grad_norm": 155.125,
5635
+ "learning_rate": 8.864337101747174e-07,
5636
+ "loss": 65.8874,
5637
+ "step": 3980
5638
+ },
5639
+ {
5640
+ "epoch": 0.9212341879600934,
5641
+ "grad_norm": 158.375,
5642
+ "learning_rate": 8.735868448098665e-07,
5643
+ "loss": 66.4925,
5644
+ "step": 3985
5645
+ },
5646
+ {
5647
+ "epoch": 0.9223900652348237,
5648
+ "grad_norm": 155.25,
5649
+ "learning_rate": 8.607399794450155e-07,
5650
+ "loss": 65.5232,
5651
+ "step": 3990
5652
+ },
5653
+ {
5654
+ "epoch": 0.923545942509554,
5655
+ "grad_norm": 147.5,
5656
+ "learning_rate": 8.478931140801645e-07,
5657
+ "loss": 64.7553,
5658
+ "step": 3995
5659
+ },
5660
+ {
5661
+ "epoch": 0.9247018197842845,
5662
+ "grad_norm": 161.75,
5663
+ "learning_rate": 8.350462487153135e-07,
5664
+ "loss": 66.8576,
5665
+ "step": 4000
5666
+ },
5667
+ {
5668
+ "epoch": 0.9247018197842845,
5669
+ "eval_loss": NaN,
5670
+ "eval_runtime": 383.9543,
5671
+ "eval_samples_per_second": 607.192,
5672
+ "eval_steps_per_second": 37.95,
5673
+ "step": 4000
5674
  }
5675
  ],
5676
  "logging_steps": 5,
 
5690
  "attributes": {}
5691
  }
5692
  },
5693
+ "total_flos": 1.7329431971155149e+19,
5694
  "train_batch_size": 4,
5695
  "trial_name": null,
5696
  "trial_params": null