CocoRoF commited on
Commit
8ce0e00
·
verified ·
1 Parent(s): 4d04e00

Training in progress, step 4000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdd14f9cdc77a23d18ca191f1dfe68ba64a190cf53ce34243c487e0160d64132
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d93859b3f54bac8daa5949d1dd355a2a00d83ee3a118f96100fa3ebb9da8bc60
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4096a703c1fe1bec508db40b6aea5e14a5713770a1786d77aed3cd5040123021
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7937948b64a0cbcfba3f2fac38861165be5e6be90d0b693ca42ac510f2a90bef
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04cb5208648fd09a2e0403d51973f74ffbfd93cbd5da59e1e99c8df03769a86c
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e735ed11597ed40a2b6854e0229902e1a21fedc0a0dbc608ca905fae57d5b06b
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7034685b36b93a4dd3a50697b0b1c314b249b2189ec2cb96b757312b1514a579
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ba3815fc0953b1b7f08cea092dfc0a62c4bbc2a2c68780d3f4dd0b5e22582a7
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e851fe1c1de0057f4eecefed6a131fa9021334eb43f6e7e65fdb270a25ac864
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:647ac15563fcad903adbb616e9b2c36b237a3ed5939d088620212da969930f6c
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:978379030048e432baa510ec4fc9514faa08fe564ab964b3a4d05e8f60306495
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93e3733c5b180986b7efbec17b663bf5231343d187374d184768fcd913797167
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdbc75d90af112615b53d15931e8157a80e37bcd110aac9a3089f5f6f5344171
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9820ea4fec1b01f3da091290c3e8b5ddb86a3a3fa17285c248b64910c2d0b4f0
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c8a310f6ca2ca89570eb2cc68544656b30224f00b2d6d96eeda6e0cb8be50ab
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7413035def085e41776a629afc94fc24fe5a955f1ad83b32f9b370ab60f9a18d
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c5b8110fcf6e044b6860c6305be969cfe03129549b92dc6fc2394448e9265d6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91e3953bcbf4089415abffbd914fbbe4580121f6c843eabbf70624c5ed144814
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f936acaf5a2d5fe8c38d945450417facbf1577584c216908a396d3cc20bec88
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:369fde7bff4dfc0d6b9cf773cf9b0352696083f84763999e05a631ee6d52c5e3
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47f6ae61b86ec1e2b5a6767419416b8803731fef933343a6831b194cd48a7616
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bef6a7bf53166ec3a9709e315e5a7afc807cf01be6b61a09c96b7113cbb6fd6
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8091140923112489,
5
  "eval_steps": 500,
6
- "global_step": 3500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4963,6 +4963,714 @@
4963
  "eval_samples_per_second": 605.018,
4964
  "eval_steps_per_second": 37.814,
4965
  "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4966
  }
4967
  ],
4968
  "logging_steps": 5,
@@ -4982,7 +5690,7 @@
4982
  "attributes": {}
4983
  }
4984
  },
4985
- "total_flos": 1.5163252974760755e+19,
4986
  "train_batch_size": 4,
4987
  "trial_name": null,
4988
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9247018197842845,
5
  "eval_steps": 500,
6
+ "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4963
  "eval_samples_per_second": 605.018,
4964
  "eval_steps_per_second": 37.814,
4965
  "step": 3500
4966
+ },
4967
+ {
4968
+ "epoch": 0.8102699695859792,
4969
+ "grad_norm": 143.625,
4970
+ "learning_rate": 2.1068859198355603e-06,
4971
+ "loss": 84.304,
4972
+ "step": 3505
4973
+ },
4974
+ {
4975
+ "epoch": 0.8114258468607095,
4976
+ "grad_norm": 150.75,
4977
+ "learning_rate": 2.094039054470709e-06,
4978
+ "loss": 84.0018,
4979
+ "step": 3510
4980
+ },
4981
+ {
4982
+ "epoch": 0.8125817241354399,
4983
+ "grad_norm": 142.0,
4984
+ "learning_rate": 2.0811921891058584e-06,
4985
+ "loss": 84.433,
4986
+ "step": 3515
4987
+ },
4988
+ {
4989
+ "epoch": 0.8137376014101703,
4990
+ "grad_norm": 143.25,
4991
+ "learning_rate": 2.0683453237410072e-06,
4992
+ "loss": 84.3377,
4993
+ "step": 3520
4994
+ },
4995
+ {
4996
+ "epoch": 0.8148934786849006,
4997
+ "grad_norm": 146.125,
4998
+ "learning_rate": 2.0554984583761565e-06,
4999
+ "loss": 84.5472,
5000
+ "step": 3525
5001
+ },
5002
+ {
5003
+ "epoch": 0.816049355959631,
5004
+ "grad_norm": 151.5,
5005
+ "learning_rate": 2.0426515930113053e-06,
5006
+ "loss": 82.4523,
5007
+ "step": 3530
5008
+ },
5009
+ {
5010
+ "epoch": 0.8172052332343613,
5011
+ "grad_norm": 141.125,
5012
+ "learning_rate": 2.0298047276464546e-06,
5013
+ "loss": 84.5754,
5014
+ "step": 3535
5015
+ },
5016
+ {
5017
+ "epoch": 0.8183611105090917,
5018
+ "grad_norm": 147.5,
5019
+ "learning_rate": 2.0169578622816035e-06,
5020
+ "loss": 82.441,
5021
+ "step": 3540
5022
+ },
5023
+ {
5024
+ "epoch": 0.8195169877838221,
5025
+ "grad_norm": 146.625,
5026
+ "learning_rate": 2.0041109969167523e-06,
5027
+ "loss": 86.3065,
5028
+ "step": 3545
5029
+ },
5030
+ {
5031
+ "epoch": 0.8206728650585524,
5032
+ "grad_norm": 152.5,
5033
+ "learning_rate": 1.9912641315519016e-06,
5034
+ "loss": 83.517,
5035
+ "step": 3550
5036
+ },
5037
+ {
5038
+ "epoch": 0.8218287423332827,
5039
+ "grad_norm": 145.875,
5040
+ "learning_rate": 1.9784172661870504e-06,
5041
+ "loss": 84.9357,
5042
+ "step": 3555
5043
+ },
5044
+ {
5045
+ "epoch": 0.8229846196080132,
5046
+ "grad_norm": 134.75,
5047
+ "learning_rate": 1.9655704008221997e-06,
5048
+ "loss": 83.7317,
5049
+ "step": 3560
5050
+ },
5051
+ {
5052
+ "epoch": 0.8241404968827435,
5053
+ "grad_norm": 152.0,
5054
+ "learning_rate": 1.9527235354573485e-06,
5055
+ "loss": 81.6789,
5056
+ "step": 3565
5057
+ },
5058
+ {
5059
+ "epoch": 0.8252963741574738,
5060
+ "grad_norm": 165.0,
5061
+ "learning_rate": 1.9398766700924974e-06,
5062
+ "loss": 83.5837,
5063
+ "step": 3570
5064
+ },
5065
+ {
5066
+ "epoch": 0.8264522514322041,
5067
+ "grad_norm": 143.375,
5068
+ "learning_rate": 1.9270298047276467e-06,
5069
+ "loss": 84.4118,
5070
+ "step": 3575
5071
+ },
5072
+ {
5073
+ "epoch": 0.8276081287069346,
5074
+ "grad_norm": 141.375,
5075
+ "learning_rate": 1.9141829393627955e-06,
5076
+ "loss": 81.7501,
5077
+ "step": 3580
5078
+ },
5079
+ {
5080
+ "epoch": 0.8287640059816649,
5081
+ "grad_norm": 136.375,
5082
+ "learning_rate": 1.9013360739979448e-06,
5083
+ "loss": 83.1388,
5084
+ "step": 3585
5085
+ },
5086
+ {
5087
+ "epoch": 0.8299198832563952,
5088
+ "grad_norm": 152.25,
5089
+ "learning_rate": 1.8884892086330936e-06,
5090
+ "loss": 82.2967,
5091
+ "step": 3590
5092
+ },
5093
+ {
5094
+ "epoch": 0.8310757605311256,
5095
+ "grad_norm": 141.875,
5096
+ "learning_rate": 1.8756423432682427e-06,
5097
+ "loss": 83.2457,
5098
+ "step": 3595
5099
+ },
5100
+ {
5101
+ "epoch": 0.832231637805856,
5102
+ "grad_norm": 146.625,
5103
+ "learning_rate": 1.8627954779033915e-06,
5104
+ "loss": 84.0741,
5105
+ "step": 3600
5106
+ },
5107
+ {
5108
+ "epoch": 0.8333875150805863,
5109
+ "grad_norm": 142.875,
5110
+ "learning_rate": 1.8499486125385408e-06,
5111
+ "loss": 84.0243,
5112
+ "step": 3605
5113
+ },
5114
+ {
5115
+ "epoch": 0.8345433923553167,
5116
+ "grad_norm": 143.0,
5117
+ "learning_rate": 1.8371017471736899e-06,
5118
+ "loss": 84.5223,
5119
+ "step": 3610
5120
+ },
5121
+ {
5122
+ "epoch": 0.835699269630047,
5123
+ "grad_norm": 139.25,
5124
+ "learning_rate": 1.8242548818088387e-06,
5125
+ "loss": 82.374,
5126
+ "step": 3615
5127
+ },
5128
+ {
5129
+ "epoch": 0.8368551469047774,
5130
+ "grad_norm": 135.875,
5131
+ "learning_rate": 1.811408016443988e-06,
5132
+ "loss": 83.3824,
5133
+ "step": 3620
5134
+ },
5135
+ {
5136
+ "epoch": 0.8380110241795078,
5137
+ "grad_norm": 146.875,
5138
+ "learning_rate": 1.7985611510791368e-06,
5139
+ "loss": 82.5953,
5140
+ "step": 3625
5141
+ },
5142
+ {
5143
+ "epoch": 0.8391669014542381,
5144
+ "grad_norm": 140.125,
5145
+ "learning_rate": 1.7857142857142859e-06,
5146
+ "loss": 83.3932,
5147
+ "step": 3630
5148
+ },
5149
+ {
5150
+ "epoch": 0.8403227787289684,
5151
+ "grad_norm": 145.125,
5152
+ "learning_rate": 1.7728674203494347e-06,
5153
+ "loss": 84.7285,
5154
+ "step": 3635
5155
+ },
5156
+ {
5157
+ "epoch": 0.8414786560036988,
5158
+ "grad_norm": 147.5,
5159
+ "learning_rate": 1.760020554984584e-06,
5160
+ "loss": 83.1239,
5161
+ "step": 3640
5162
+ },
5163
+ {
5164
+ "epoch": 0.8426345332784292,
5165
+ "grad_norm": 146.125,
5166
+ "learning_rate": 1.747173689619733e-06,
5167
+ "loss": 85.1945,
5168
+ "step": 3645
5169
+ },
5170
+ {
5171
+ "epoch": 0.8437904105531595,
5172
+ "grad_norm": 156.125,
5173
+ "learning_rate": 1.734326824254882e-06,
5174
+ "loss": 83.2979,
5175
+ "step": 3650
5176
+ },
5177
+ {
5178
+ "epoch": 0.8449462878278898,
5179
+ "grad_norm": 146.0,
5180
+ "learning_rate": 1.721479958890031e-06,
5181
+ "loss": 82.1503,
5182
+ "step": 3655
5183
+ },
5184
+ {
5185
+ "epoch": 0.8461021651026203,
5186
+ "grad_norm": 148.0,
5187
+ "learning_rate": 1.7086330935251798e-06,
5188
+ "loss": 83.1019,
5189
+ "step": 3660
5190
+ },
5191
+ {
5192
+ "epoch": 0.8472580423773506,
5193
+ "grad_norm": 143.625,
5194
+ "learning_rate": 1.695786228160329e-06,
5195
+ "loss": 83.5616,
5196
+ "step": 3665
5197
+ },
5198
+ {
5199
+ "epoch": 0.8484139196520809,
5200
+ "grad_norm": 142.75,
5201
+ "learning_rate": 1.682939362795478e-06,
5202
+ "loss": 84.0701,
5203
+ "step": 3670
5204
+ },
5205
+ {
5206
+ "epoch": 0.8495697969268113,
5207
+ "grad_norm": 138.25,
5208
+ "learning_rate": 1.670092497430627e-06,
5209
+ "loss": 82.2154,
5210
+ "step": 3675
5211
+ },
5212
+ {
5213
+ "epoch": 0.8507256742015417,
5214
+ "grad_norm": 137.0,
5215
+ "learning_rate": 1.6572456320657763e-06,
5216
+ "loss": 82.9864,
5217
+ "step": 3680
5218
+ },
5219
+ {
5220
+ "epoch": 0.851881551476272,
5221
+ "grad_norm": 146.125,
5222
+ "learning_rate": 1.6443987667009251e-06,
5223
+ "loss": 82.6712,
5224
+ "step": 3685
5225
+ },
5226
+ {
5227
+ "epoch": 0.8530374287510024,
5228
+ "grad_norm": 142.25,
5229
+ "learning_rate": 1.6315519013360742e-06,
5230
+ "loss": 83.6857,
5231
+ "step": 3690
5232
+ },
5233
+ {
5234
+ "epoch": 0.8541933060257327,
5235
+ "grad_norm": 141.875,
5236
+ "learning_rate": 1.618705035971223e-06,
5237
+ "loss": 82.5429,
5238
+ "step": 3695
5239
+ },
5240
+ {
5241
+ "epoch": 0.8553491833004631,
5242
+ "grad_norm": 152.0,
5243
+ "learning_rate": 1.6058581706063723e-06,
5244
+ "loss": 82.8123,
5245
+ "step": 3700
5246
+ },
5247
+ {
5248
+ "epoch": 0.8565050605751934,
5249
+ "grad_norm": 143.375,
5250
+ "learning_rate": 1.5930113052415211e-06,
5251
+ "loss": 83.5665,
5252
+ "step": 3705
5253
+ },
5254
+ {
5255
+ "epoch": 0.8576609378499238,
5256
+ "grad_norm": 150.875,
5257
+ "learning_rate": 1.5801644398766702e-06,
5258
+ "loss": 83.4201,
5259
+ "step": 3710
5260
+ },
5261
+ {
5262
+ "epoch": 0.8588168151246541,
5263
+ "grad_norm": 153.0,
5264
+ "learning_rate": 1.5673175745118195e-06,
5265
+ "loss": 81.9219,
5266
+ "step": 3715
5267
+ },
5268
+ {
5269
+ "epoch": 0.8599726923993845,
5270
+ "grad_norm": 147.875,
5271
+ "learning_rate": 1.5544707091469683e-06,
5272
+ "loss": 83.8904,
5273
+ "step": 3720
5274
+ },
5275
+ {
5276
+ "epoch": 0.8611285696741149,
5277
+ "grad_norm": 137.125,
5278
+ "learning_rate": 1.5416238437821174e-06,
5279
+ "loss": 82.7793,
5280
+ "step": 3725
5281
+ },
5282
+ {
5283
+ "epoch": 0.8622844469488452,
5284
+ "grad_norm": 146.25,
5285
+ "learning_rate": 1.5287769784172662e-06,
5286
+ "loss": 83.6293,
5287
+ "step": 3730
5288
+ },
5289
+ {
5290
+ "epoch": 0.8634403242235755,
5291
+ "grad_norm": 139.625,
5292
+ "learning_rate": 1.5159301130524153e-06,
5293
+ "loss": 81.7332,
5294
+ "step": 3735
5295
+ },
5296
+ {
5297
+ "epoch": 0.864596201498306,
5298
+ "grad_norm": 152.125,
5299
+ "learning_rate": 1.5030832476875643e-06,
5300
+ "loss": 82.7963,
5301
+ "step": 3740
5302
+ },
5303
+ {
5304
+ "epoch": 0.8657520787730363,
5305
+ "grad_norm": 149.875,
5306
+ "learning_rate": 1.4902363823227134e-06,
5307
+ "loss": 83.1981,
5308
+ "step": 3745
5309
+ },
5310
+ {
5311
+ "epoch": 0.8669079560477666,
5312
+ "grad_norm": 136.0,
5313
+ "learning_rate": 1.4773895169578625e-06,
5314
+ "loss": 83.8111,
5315
+ "step": 3750
5316
+ },
5317
+ {
5318
+ "epoch": 0.868063833322497,
5319
+ "grad_norm": 150.5,
5320
+ "learning_rate": 1.4645426515930113e-06,
5321
+ "loss": 83.2902,
5322
+ "step": 3755
5323
+ },
5324
+ {
5325
+ "epoch": 0.8692197105972274,
5326
+ "grad_norm": 138.375,
5327
+ "learning_rate": 1.4516957862281606e-06,
5328
+ "loss": 84.2568,
5329
+ "step": 3760
5330
+ },
5331
+ {
5332
+ "epoch": 0.8703755878719577,
5333
+ "grad_norm": 139.125,
5334
+ "learning_rate": 1.4388489208633094e-06,
5335
+ "loss": 82.5677,
5336
+ "step": 3765
5337
+ },
5338
+ {
5339
+ "epoch": 0.871531465146688,
5340
+ "grad_norm": 147.0,
5341
+ "learning_rate": 1.4260020554984585e-06,
5342
+ "loss": 82.9927,
5343
+ "step": 3770
5344
+ },
5345
+ {
5346
+ "epoch": 0.8726873424214184,
5347
+ "grad_norm": 149.0,
5348
+ "learning_rate": 1.4131551901336073e-06,
5349
+ "loss": 82.7173,
5350
+ "step": 3775
5351
+ },
5352
+ {
5353
+ "epoch": 0.8738432196961488,
5354
+ "grad_norm": 147.5,
5355
+ "learning_rate": 1.4003083247687566e-06,
5356
+ "loss": 83.3356,
5357
+ "step": 3780
5358
+ },
5359
+ {
5360
+ "epoch": 0.8749990969708791,
5361
+ "grad_norm": 143.625,
5362
+ "learning_rate": 1.3874614594039057e-06,
5363
+ "loss": 81.6103,
5364
+ "step": 3785
5365
+ },
5366
+ {
5367
+ "epoch": 0.8761549742456095,
5368
+ "grad_norm": 140.0,
5369
+ "learning_rate": 1.3746145940390545e-06,
5370
+ "loss": 81.7399,
5371
+ "step": 3790
5372
+ },
5373
+ {
5374
+ "epoch": 0.8773108515203398,
5375
+ "grad_norm": 142.125,
5376
+ "learning_rate": 1.3617677286742038e-06,
5377
+ "loss": 82.5157,
5378
+ "step": 3795
5379
+ },
5380
+ {
5381
+ "epoch": 0.8784667287950702,
5382
+ "grad_norm": 143.625,
5383
+ "learning_rate": 1.3489208633093526e-06,
5384
+ "loss": 83.8571,
5385
+ "step": 3800
5386
+ },
5387
+ {
5388
+ "epoch": 0.8796226060698006,
5389
+ "grad_norm": 140.75,
5390
+ "learning_rate": 1.3360739979445017e-06,
5391
+ "loss": 83.0828,
5392
+ "step": 3805
5393
+ },
5394
+ {
5395
+ "epoch": 0.8807784833445309,
5396
+ "grad_norm": 135.375,
5397
+ "learning_rate": 1.3232271325796505e-06,
5398
+ "loss": 82.2537,
5399
+ "step": 3810
5400
+ },
5401
+ {
5402
+ "epoch": 0.8819343606192612,
5403
+ "grad_norm": 143.75,
5404
+ "learning_rate": 1.3103802672147998e-06,
5405
+ "loss": 82.8528,
5406
+ "step": 3815
5407
+ },
5408
+ {
5409
+ "epoch": 0.8830902378939917,
5410
+ "grad_norm": 140.0,
5411
+ "learning_rate": 1.2975334018499486e-06,
5412
+ "loss": 83.4548,
5413
+ "step": 3820
5414
+ },
5415
+ {
5416
+ "epoch": 0.884246115168722,
5417
+ "grad_norm": 137.5,
5418
+ "learning_rate": 1.2846865364850977e-06,
5419
+ "loss": 82.9603,
5420
+ "step": 3825
5421
+ },
5422
+ {
5423
+ "epoch": 0.8854019924434523,
5424
+ "grad_norm": 143.75,
5425
+ "learning_rate": 1.2718396711202468e-06,
5426
+ "loss": 82.612,
5427
+ "step": 3830
5428
+ },
5429
+ {
5430
+ "epoch": 0.8865578697181826,
5431
+ "grad_norm": 134.75,
5432
+ "learning_rate": 1.2589928057553958e-06,
5433
+ "loss": 81.9109,
5434
+ "step": 3835
5435
+ },
5436
+ {
5437
+ "epoch": 0.887713746992913,
5438
+ "grad_norm": 149.875,
5439
+ "learning_rate": 1.2461459403905449e-06,
5440
+ "loss": 83.5588,
5441
+ "step": 3840
5442
+ },
5443
+ {
5444
+ "epoch": 0.8888696242676434,
5445
+ "grad_norm": 140.125,
5446
+ "learning_rate": 1.233299075025694e-06,
5447
+ "loss": 81.9114,
5448
+ "step": 3845
5449
+ },
5450
+ {
5451
+ "epoch": 0.8900255015423737,
5452
+ "grad_norm": 140.875,
5453
+ "learning_rate": 1.2204522096608428e-06,
5454
+ "loss": 82.8397,
5455
+ "step": 3850
5456
+ },
5457
+ {
5458
+ "epoch": 0.8911813788171041,
5459
+ "grad_norm": 133.75,
5460
+ "learning_rate": 1.2076053442959918e-06,
5461
+ "loss": 83.9099,
5462
+ "step": 3855
5463
+ },
5464
+ {
5465
+ "epoch": 0.8923372560918345,
5466
+ "grad_norm": 142.75,
5467
+ "learning_rate": 1.194758478931141e-06,
5468
+ "loss": 83.4947,
5469
+ "step": 3860
5470
+ },
5471
+ {
5472
+ "epoch": 0.8934931333665648,
5473
+ "grad_norm": 144.25,
5474
+ "learning_rate": 1.18191161356629e-06,
5475
+ "loss": 83.0275,
5476
+ "step": 3865
5477
+ },
5478
+ {
5479
+ "epoch": 0.8946490106412952,
5480
+ "grad_norm": 157.0,
5481
+ "learning_rate": 1.1690647482014388e-06,
5482
+ "loss": 81.9333,
5483
+ "step": 3870
5484
+ },
5485
+ {
5486
+ "epoch": 0.8958048879160255,
5487
+ "grad_norm": 136.5,
5488
+ "learning_rate": 1.156217882836588e-06,
5489
+ "loss": 81.9801,
5490
+ "step": 3875
5491
+ },
5492
+ {
5493
+ "epoch": 0.8969607651907558,
5494
+ "grad_norm": 141.375,
5495
+ "learning_rate": 1.1433710174717371e-06,
5496
+ "loss": 82.9318,
5497
+ "step": 3880
5498
+ },
5499
+ {
5500
+ "epoch": 0.8981166424654863,
5501
+ "grad_norm": 140.625,
5502
+ "learning_rate": 1.130524152106886e-06,
5503
+ "loss": 82.2715,
5504
+ "step": 3885
5505
+ },
5506
+ {
5507
+ "epoch": 0.8992725197402166,
5508
+ "grad_norm": 139.25,
5509
+ "learning_rate": 1.117677286742035e-06,
5510
+ "loss": 82.4903,
5511
+ "step": 3890
5512
+ },
5513
+ {
5514
+ "epoch": 0.9004283970149469,
5515
+ "grad_norm": 140.0,
5516
+ "learning_rate": 1.1048304213771841e-06,
5517
+ "loss": 82.3743,
5518
+ "step": 3895
5519
+ },
5520
+ {
5521
+ "epoch": 0.9015842742896772,
5522
+ "grad_norm": 141.875,
5523
+ "learning_rate": 1.091983556012333e-06,
5524
+ "loss": 79.7922,
5525
+ "step": 3900
5526
+ },
5527
+ {
5528
+ "epoch": 0.9027401515644077,
5529
+ "grad_norm": 140.125,
5530
+ "learning_rate": 1.079136690647482e-06,
5531
+ "loss": 80.9088,
5532
+ "step": 3905
5533
+ },
5534
+ {
5535
+ "epoch": 0.903896028839138,
5536
+ "grad_norm": 139.5,
5537
+ "learning_rate": 1.066289825282631e-06,
5538
+ "loss": 82.9754,
5539
+ "step": 3910
5540
+ },
5541
+ {
5542
+ "epoch": 0.9050519061138683,
5543
+ "grad_norm": 135.25,
5544
+ "learning_rate": 1.0534429599177801e-06,
5545
+ "loss": 82.7032,
5546
+ "step": 3915
5547
+ },
5548
+ {
5549
+ "epoch": 0.9062077833885988,
5550
+ "grad_norm": 138.5,
5551
+ "learning_rate": 1.0405960945529292e-06,
5552
+ "loss": 80.908,
5553
+ "step": 3920
5554
+ },
5555
+ {
5556
+ "epoch": 0.9073636606633291,
5557
+ "grad_norm": 151.25,
5558
+ "learning_rate": 1.0277492291880783e-06,
5559
+ "loss": 82.3395,
5560
+ "step": 3925
5561
+ },
5562
+ {
5563
+ "epoch": 0.9085195379380594,
5564
+ "grad_norm": 141.625,
5565
+ "learning_rate": 1.0149023638232273e-06,
5566
+ "loss": 81.7353,
5567
+ "step": 3930
5568
+ },
5569
+ {
5570
+ "epoch": 0.9096754152127898,
5571
+ "grad_norm": 154.375,
5572
+ "learning_rate": 1.0020554984583762e-06,
5573
+ "loss": 81.4314,
5574
+ "step": 3935
5575
+ },
5576
+ {
5577
+ "epoch": 0.9108312924875201,
5578
+ "grad_norm": 137.625,
5579
+ "learning_rate": 9.892086330935252e-07,
5580
+ "loss": 82.1363,
5581
+ "step": 3940
5582
+ },
5583
+ {
5584
+ "epoch": 0.9119871697622505,
5585
+ "grad_norm": 141.0,
5586
+ "learning_rate": 9.763617677286743e-07,
5587
+ "loss": 83.2808,
5588
+ "step": 3945
5589
+ },
5590
+ {
5591
+ "epoch": 0.9131430470369809,
5592
+ "grad_norm": 140.25,
5593
+ "learning_rate": 9.635149023638233e-07,
5594
+ "loss": 81.7309,
5595
+ "step": 3950
5596
+ },
5597
+ {
5598
+ "epoch": 0.9142989243117112,
5599
+ "grad_norm": 136.125,
5600
+ "learning_rate": 9.506680369989724e-07,
5601
+ "loss": 80.9205,
5602
+ "step": 3955
5603
+ },
5604
+ {
5605
+ "epoch": 0.9154548015864415,
5606
+ "grad_norm": 140.25,
5607
+ "learning_rate": 9.378211716341213e-07,
5608
+ "loss": 83.6719,
5609
+ "step": 3960
5610
+ },
5611
+ {
5612
+ "epoch": 0.9166106788611719,
5613
+ "grad_norm": 142.5,
5614
+ "learning_rate": 9.249743062692704e-07,
5615
+ "loss": 82.9615,
5616
+ "step": 3965
5617
+ },
5618
+ {
5619
+ "epoch": 0.9177665561359023,
5620
+ "grad_norm": 137.125,
5621
+ "learning_rate": 9.121274409044194e-07,
5622
+ "loss": 83.3444,
5623
+ "step": 3970
5624
+ },
5625
+ {
5626
+ "epoch": 0.9189224334106326,
5627
+ "grad_norm": 137.75,
5628
+ "learning_rate": 8.992805755395684e-07,
5629
+ "loss": 82.6694,
5630
+ "step": 3975
5631
+ },
5632
+ {
5633
+ "epoch": 0.9200783106853629,
5634
+ "grad_norm": 142.0,
5635
+ "learning_rate": 8.864337101747174e-07,
5636
+ "loss": 81.7997,
5637
+ "step": 3980
5638
+ },
5639
+ {
5640
+ "epoch": 0.9212341879600934,
5641
+ "grad_norm": 138.875,
5642
+ "learning_rate": 8.735868448098665e-07,
5643
+ "loss": 81.687,
5644
+ "step": 3985
5645
+ },
5646
+ {
5647
+ "epoch": 0.9223900652348237,
5648
+ "grad_norm": 135.0,
5649
+ "learning_rate": 8.607399794450155e-07,
5650
+ "loss": 81.6286,
5651
+ "step": 3990
5652
+ },
5653
+ {
5654
+ "epoch": 0.923545942509554,
5655
+ "grad_norm": 140.625,
5656
+ "learning_rate": 8.478931140801645e-07,
5657
+ "loss": 80.7711,
5658
+ "step": 3995
5659
+ },
5660
+ {
5661
+ "epoch": 0.9247018197842845,
5662
+ "grad_norm": 149.375,
5663
+ "learning_rate": 8.350462487153135e-07,
5664
+ "loss": 83.3313,
5665
+ "step": 4000
5666
+ },
5667
+ {
5668
+ "epoch": 0.9247018197842845,
5669
+ "eval_loss": NaN,
5670
+ "eval_runtime": 381.0966,
5671
+ "eval_samples_per_second": 611.745,
5672
+ "eval_steps_per_second": 38.234,
5673
+ "step": 4000
5674
  }
5675
  ],
5676
  "logging_steps": 5,
 
5690
  "attributes": {}
5691
  }
5692
  },
5693
+ "total_flos": 1.7329431971155149e+19,
5694
  "train_batch_size": 4,
5695
  "trial_name": null,
5696
  "trial_params": null