CocoRoF commited on
Commit
ad1cf27
·
verified ·
1 Parent(s): f92ebeb

Training in progress, step 12500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dbfded01e29c2f16226927197c7b53cb17e6b0e25f4e77f11587c6e4e8cecdca
3
  size 737632172
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be61347e986df1813f8514099ca7495d01a0aaff2cb1086c995e9f56e4864f44
3
  size 737632172
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51c9c2501538e245b7dc88214c50a178f28922e91dc78a7635e1dfef030205c3
3
  size 1475354682
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f13242f121b2c14586fd1d56f596180d4a87cf3c3e6a8b06d2aa80aac67af52
3
  size 1475354682
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a2fbcd26bac3ea7dc02fc9ede5b8a1914ca51611473722a11a969e1f26ac0ee
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0169a1fcfdb795965595a4f242f88323799f2590a349006ef637e474b948bd8
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66d97b511d2fdb8061e5bf72c139923941c148260fac1caedd654028da6986c1
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c87b1e14f1bde64851a26e79d9e7529d68eb1e143f87cb05c0bdf4c84c676fd8
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3839473129eb8c438ab312370daa55eb10a0790f33d38fc5eaa24859b54b0d1f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bea1a1e24ce9ba044268ec704e9b5435b962ac6a2de09e9847caeff5397ad96
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5088a0d34c7015afe60457fbb3f0a4740839369017a42ea4b3250322c2d63ceb
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ec0b58b8e0950d68e3cbb11f67305b0912f5521574e32c474ae22410e7fadc8
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9cac0eb25286b75549fa2030810940adf357064a83facaf5c58ebe37190b6ac
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:284664914b90ff8b8a0dc92bb3d3f63cfa784487322d7dc115bd6038f6758aca
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0a57d29811122d52bd53f81af680412b91dde1cd2a12fa885d8a54388be8e2d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0468917ff18a692d9dedf8d79fa5e11dd93feec07877799ca86aca3ed690d129
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c90ab29b255eaf920ecc1cba0b586e426f8e2db67b44a65576693f84178a04f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:842bb0cfac2bb06e2e811dabf1e415d78f36184efada16b4098faa08d32c3580
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4efbfa3cfb1bb8fb9c3380e65959a8b4eaf3bceb0507a26ffba1a3e4636ddb1
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca857437aedb445e7bf3dd47069eecac538c7d7fd16d601188beef14a54e520e
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b579900d94a8c528190bb9fc0315439f3c057f344b31a3968eaa60ed56b9c9f5
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:123bdb81188a6d8339925f843c784a6596cb7ed0221abdafc8c5e0e110c82c27
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.44380093309146185,
5
  "eval_steps": 1000,
6
- "global_step": 10000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -7087,6 +7087,1772 @@
7087
  "eval_samples_per_second": 1802.303,
7088
  "eval_steps_per_second": 56.322,
7089
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7090
  }
7091
  ],
7092
  "logging_steps": 10,
@@ -7106,7 +8872,7 @@
7106
  "attributes": {}
7107
  }
7108
  },
7109
- "total_flos": 3.489723205025792e+18,
7110
  "train_batch_size": 4,
7111
  "trial_name": null,
7112
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5547511663643273,
5
  "eval_steps": 1000,
6
+ "global_step": 12500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
7087
  "eval_samples_per_second": 1802.303,
7088
  "eval_steps_per_second": 56.322,
7089
  "step": 10000
7090
+ },
7091
+ {
7092
+ "epoch": 0.4442447340245533,
7093
+ "grad_norm": 87.5740966796875,
7094
+ "learning_rate": 9.98264670211062e-06,
7095
+ "loss": 11.53,
7096
+ "step": 10010
7097
+ },
7098
+ {
7099
+ "epoch": 0.44468853495764477,
7100
+ "grad_norm": 79.04003143310547,
7101
+ "learning_rate": 9.982629366148691e-06,
7102
+ "loss": 11.2839,
7103
+ "step": 10020
7104
+ },
7105
+ {
7106
+ "epoch": 0.4451323358907362,
7107
+ "grad_norm": 93.09468841552734,
7108
+ "learning_rate": 9.982612030186765e-06,
7109
+ "loss": 11.685,
7110
+ "step": 10030
7111
+ },
7112
+ {
7113
+ "epoch": 0.4455761368238277,
7114
+ "grad_norm": 75.00133514404297,
7115
+ "learning_rate": 9.982594694224838e-06,
7116
+ "loss": 10.8954,
7117
+ "step": 10040
7118
+ },
7119
+ {
7120
+ "epoch": 0.4460199377569191,
7121
+ "grad_norm": 95.3443374633789,
7122
+ "learning_rate": 9.982577358262909e-06,
7123
+ "loss": 10.8405,
7124
+ "step": 10050
7125
+ },
7126
+ {
7127
+ "epoch": 0.4464637386900106,
7128
+ "grad_norm": 81.28312683105469,
7129
+ "learning_rate": 9.982560022300982e-06,
7130
+ "loss": 10.8736,
7131
+ "step": 10060
7132
+ },
7133
+ {
7134
+ "epoch": 0.44690753962310203,
7135
+ "grad_norm": 84.82075500488281,
7136
+ "learning_rate": 9.982542686339055e-06,
7137
+ "loss": 10.9681,
7138
+ "step": 10070
7139
+ },
7140
+ {
7141
+ "epoch": 0.4473513405561935,
7142
+ "grad_norm": 83.95282745361328,
7143
+ "learning_rate": 9.982525350377127e-06,
7144
+ "loss": 11.012,
7145
+ "step": 10080
7146
+ },
7147
+ {
7148
+ "epoch": 0.44779514148928495,
7149
+ "grad_norm": 77.86396026611328,
7150
+ "learning_rate": 9.9825080144152e-06,
7151
+ "loss": 11.2847,
7152
+ "step": 10090
7153
+ },
7154
+ {
7155
+ "epoch": 0.44823894242237644,
7156
+ "grad_norm": 87.20881652832031,
7157
+ "learning_rate": 9.982490678453273e-06,
7158
+ "loss": 11.2584,
7159
+ "step": 10100
7160
+ },
7161
+ {
7162
+ "epoch": 0.44868274335546793,
7163
+ "grad_norm": 100.79487609863281,
7164
+ "learning_rate": 9.982473342491344e-06,
7165
+ "loss": 11.8089,
7166
+ "step": 10110
7167
+ },
7168
+ {
7169
+ "epoch": 0.44912654428855936,
7170
+ "grad_norm": 81.89049530029297,
7171
+ "learning_rate": 9.982456006529417e-06,
7172
+ "loss": 11.5939,
7173
+ "step": 10120
7174
+ },
7175
+ {
7176
+ "epoch": 0.44957034522165085,
7177
+ "grad_norm": 91.9642562866211,
7178
+ "learning_rate": 9.98243867056749e-06,
7179
+ "loss": 10.6203,
7180
+ "step": 10130
7181
+ },
7182
+ {
7183
+ "epoch": 0.4500141461547423,
7184
+ "grad_norm": 98.22434997558594,
7185
+ "learning_rate": 9.982421334605562e-06,
7186
+ "loss": 10.6234,
7187
+ "step": 10140
7188
+ },
7189
+ {
7190
+ "epoch": 0.45045794708783377,
7191
+ "grad_norm": 92.3149185180664,
7192
+ "learning_rate": 9.982403998643635e-06,
7193
+ "loss": 11.1096,
7194
+ "step": 10150
7195
+ },
7196
+ {
7197
+ "epoch": 0.4509017480209252,
7198
+ "grad_norm": 96.91551971435547,
7199
+ "learning_rate": 9.982386662681708e-06,
7200
+ "loss": 11.3593,
7201
+ "step": 10160
7202
+ },
7203
+ {
7204
+ "epoch": 0.4513455489540167,
7205
+ "grad_norm": 77.00688934326172,
7206
+ "learning_rate": 9.982369326719781e-06,
7207
+ "loss": 11.0639,
7208
+ "step": 10170
7209
+ },
7210
+ {
7211
+ "epoch": 0.4517893498871081,
7212
+ "grad_norm": 81.16304016113281,
7213
+ "learning_rate": 9.982351990757852e-06,
7214
+ "loss": 11.2663,
7215
+ "step": 10180
7216
+ },
7217
+ {
7218
+ "epoch": 0.4522331508201996,
7219
+ "grad_norm": 79.47703552246094,
7220
+ "learning_rate": 9.982334654795925e-06,
7221
+ "loss": 10.8026,
7222
+ "step": 10190
7223
+ },
7224
+ {
7225
+ "epoch": 0.45267695175329103,
7226
+ "grad_norm": 95.82781219482422,
7227
+ "learning_rate": 9.982317318833998e-06,
7228
+ "loss": 10.9343,
7229
+ "step": 10200
7230
+ },
7231
+ {
7232
+ "epoch": 0.4531207526863825,
7233
+ "grad_norm": 86.30982208251953,
7234
+ "learning_rate": 9.98229998287207e-06,
7235
+ "loss": 10.8559,
7236
+ "step": 10210
7237
+ },
7238
+ {
7239
+ "epoch": 0.453564553619474,
7240
+ "grad_norm": 84.82122802734375,
7241
+ "learning_rate": 9.982282646910143e-06,
7242
+ "loss": 11.2375,
7243
+ "step": 10220
7244
+ },
7245
+ {
7246
+ "epoch": 0.45400835455256544,
7247
+ "grad_norm": 85.73993682861328,
7248
+ "learning_rate": 9.982265310948216e-06,
7249
+ "loss": 11.1234,
7250
+ "step": 10230
7251
+ },
7252
+ {
7253
+ "epoch": 0.45445215548565693,
7254
+ "grad_norm": 76.42994689941406,
7255
+ "learning_rate": 9.982247974986287e-06,
7256
+ "loss": 11.2003,
7257
+ "step": 10240
7258
+ },
7259
+ {
7260
+ "epoch": 0.45489595641874836,
7261
+ "grad_norm": 84.67989349365234,
7262
+ "learning_rate": 9.98223063902436e-06,
7263
+ "loss": 10.8626,
7264
+ "step": 10250
7265
+ },
7266
+ {
7267
+ "epoch": 0.45533975735183985,
7268
+ "grad_norm": 97.71283721923828,
7269
+ "learning_rate": 9.982213303062434e-06,
7270
+ "loss": 10.8125,
7271
+ "step": 10260
7272
+ },
7273
+ {
7274
+ "epoch": 0.4557835582849313,
7275
+ "grad_norm": 77.48724365234375,
7276
+ "learning_rate": 9.982195967100505e-06,
7277
+ "loss": 10.9105,
7278
+ "step": 10270
7279
+ },
7280
+ {
7281
+ "epoch": 0.45622735921802277,
7282
+ "grad_norm": 73.06230926513672,
7283
+ "learning_rate": 9.982178631138578e-06,
7284
+ "loss": 10.8443,
7285
+ "step": 10280
7286
+ },
7287
+ {
7288
+ "epoch": 0.4566711601511142,
7289
+ "grad_norm": 78.53364562988281,
7290
+ "learning_rate": 9.982161295176651e-06,
7291
+ "loss": 11.1985,
7292
+ "step": 10290
7293
+ },
7294
+ {
7295
+ "epoch": 0.4571149610842057,
7296
+ "grad_norm": 71.37035369873047,
7297
+ "learning_rate": 9.982143959214722e-06,
7298
+ "loss": 11.2463,
7299
+ "step": 10300
7300
+ },
7301
+ {
7302
+ "epoch": 0.4575587620172971,
7303
+ "grad_norm": 76.0063247680664,
7304
+ "learning_rate": 9.982126623252796e-06,
7305
+ "loss": 10.9098,
7306
+ "step": 10310
7307
+ },
7308
+ {
7309
+ "epoch": 0.4580025629503886,
7310
+ "grad_norm": 82.43836212158203,
7311
+ "learning_rate": 9.982109287290869e-06,
7312
+ "loss": 10.8438,
7313
+ "step": 10320
7314
+ },
7315
+ {
7316
+ "epoch": 0.4584463638834801,
7317
+ "grad_norm": 88.15770721435547,
7318
+ "learning_rate": 9.98209195132894e-06,
7319
+ "loss": 11.2367,
7320
+ "step": 10330
7321
+ },
7322
+ {
7323
+ "epoch": 0.4588901648165715,
7324
+ "grad_norm": 78.84662628173828,
7325
+ "learning_rate": 9.982074615367013e-06,
7326
+ "loss": 10.8668,
7327
+ "step": 10340
7328
+ },
7329
+ {
7330
+ "epoch": 0.459333965749663,
7331
+ "grad_norm": 93.12088012695312,
7332
+ "learning_rate": 9.982057279405086e-06,
7333
+ "loss": 11.3541,
7334
+ "step": 10350
7335
+ },
7336
+ {
7337
+ "epoch": 0.45977776668275444,
7338
+ "grad_norm": 80.99417114257812,
7339
+ "learning_rate": 9.982039943443158e-06,
7340
+ "loss": 11.7431,
7341
+ "step": 10360
7342
+ },
7343
+ {
7344
+ "epoch": 0.46022156761584593,
7345
+ "grad_norm": 78.97187805175781,
7346
+ "learning_rate": 9.98202260748123e-06,
7347
+ "loss": 11.2636,
7348
+ "step": 10370
7349
+ },
7350
+ {
7351
+ "epoch": 0.46066536854893736,
7352
+ "grad_norm": 77.9780502319336,
7353
+ "learning_rate": 9.982005271519304e-06,
7354
+ "loss": 11.3237,
7355
+ "step": 10380
7356
+ },
7357
+ {
7358
+ "epoch": 0.46110916948202885,
7359
+ "grad_norm": 94.6609115600586,
7360
+ "learning_rate": 9.981987935557377e-06,
7361
+ "loss": 11.1363,
7362
+ "step": 10390
7363
+ },
7364
+ {
7365
+ "epoch": 0.4615529704151203,
7366
+ "grad_norm": 94.51988220214844,
7367
+ "learning_rate": 9.981970599595448e-06,
7368
+ "loss": 11.1418,
7369
+ "step": 10400
7370
+ },
7371
+ {
7372
+ "epoch": 0.46199677134821177,
7373
+ "grad_norm": 89.58201599121094,
7374
+ "learning_rate": 9.981953263633521e-06,
7375
+ "loss": 11.0321,
7376
+ "step": 10410
7377
+ },
7378
+ {
7379
+ "epoch": 0.4624405722813032,
7380
+ "grad_norm": 88.75037384033203,
7381
+ "learning_rate": 9.981935927671594e-06,
7382
+ "loss": 11.1615,
7383
+ "step": 10420
7384
+ },
7385
+ {
7386
+ "epoch": 0.4628843732143947,
7387
+ "grad_norm": 72.32737731933594,
7388
+ "learning_rate": 9.981918591709666e-06,
7389
+ "loss": 10.7545,
7390
+ "step": 10430
7391
+ },
7392
+ {
7393
+ "epoch": 0.46332817414748617,
7394
+ "grad_norm": 94.58203887939453,
7395
+ "learning_rate": 9.981901255747739e-06,
7396
+ "loss": 10.8112,
7397
+ "step": 10440
7398
+ },
7399
+ {
7400
+ "epoch": 0.4637719750805776,
7401
+ "grad_norm": 88.47208404541016,
7402
+ "learning_rate": 9.981883919785812e-06,
7403
+ "loss": 11.0896,
7404
+ "step": 10450
7405
+ },
7406
+ {
7407
+ "epoch": 0.4642157760136691,
7408
+ "grad_norm": 86.8807601928711,
7409
+ "learning_rate": 9.981866583823883e-06,
7410
+ "loss": 10.6616,
7411
+ "step": 10460
7412
+ },
7413
+ {
7414
+ "epoch": 0.4646595769467605,
7415
+ "grad_norm": 67.94696807861328,
7416
+ "learning_rate": 9.981849247861956e-06,
7417
+ "loss": 10.5705,
7418
+ "step": 10470
7419
+ },
7420
+ {
7421
+ "epoch": 0.465103377879852,
7422
+ "grad_norm": 83.57070922851562,
7423
+ "learning_rate": 9.98183191190003e-06,
7424
+ "loss": 11.4698,
7425
+ "step": 10480
7426
+ },
7427
+ {
7428
+ "epoch": 0.46554717881294344,
7429
+ "grad_norm": 83.02405548095703,
7430
+ "learning_rate": 9.9818145759381e-06,
7431
+ "loss": 10.5724,
7432
+ "step": 10490
7433
+ },
7434
+ {
7435
+ "epoch": 0.46599097974603493,
7436
+ "grad_norm": 83.82646942138672,
7437
+ "learning_rate": 9.981797239976174e-06,
7438
+ "loss": 11.0542,
7439
+ "step": 10500
7440
+ },
7441
+ {
7442
+ "epoch": 0.46643478067912636,
7443
+ "grad_norm": 77.04849243164062,
7444
+ "learning_rate": 9.981779904014247e-06,
7445
+ "loss": 11.3302,
7446
+ "step": 10510
7447
+ },
7448
+ {
7449
+ "epoch": 0.46687858161221785,
7450
+ "grad_norm": 74.8342514038086,
7451
+ "learning_rate": 9.981762568052318e-06,
7452
+ "loss": 11.5481,
7453
+ "step": 10520
7454
+ },
7455
+ {
7456
+ "epoch": 0.4673223825453093,
7457
+ "grad_norm": 76.42134094238281,
7458
+ "learning_rate": 9.981745232090391e-06,
7459
+ "loss": 10.9186,
7460
+ "step": 10530
7461
+ },
7462
+ {
7463
+ "epoch": 0.46776618347840077,
7464
+ "grad_norm": 82.68692779541016,
7465
+ "learning_rate": 9.981727896128464e-06,
7466
+ "loss": 11.2538,
7467
+ "step": 10540
7468
+ },
7469
+ {
7470
+ "epoch": 0.46820998441149225,
7471
+ "grad_norm": 69.0387191772461,
7472
+ "learning_rate": 9.981710560166536e-06,
7473
+ "loss": 10.8114,
7474
+ "step": 10550
7475
+ },
7476
+ {
7477
+ "epoch": 0.4686537853445837,
7478
+ "grad_norm": 103.88386535644531,
7479
+ "learning_rate": 9.981693224204609e-06,
7480
+ "loss": 10.9474,
7481
+ "step": 10560
7482
+ },
7483
+ {
7484
+ "epoch": 0.46909758627767517,
7485
+ "grad_norm": 88.8089370727539,
7486
+ "learning_rate": 9.981675888242682e-06,
7487
+ "loss": 10.8754,
7488
+ "step": 10570
7489
+ },
7490
+ {
7491
+ "epoch": 0.4695413872107666,
7492
+ "grad_norm": 79.1522216796875,
7493
+ "learning_rate": 9.981658552280753e-06,
7494
+ "loss": 11.2002,
7495
+ "step": 10580
7496
+ },
7497
+ {
7498
+ "epoch": 0.4699851881438581,
7499
+ "grad_norm": 75.01962280273438,
7500
+ "learning_rate": 9.981641216318826e-06,
7501
+ "loss": 10.5971,
7502
+ "step": 10590
7503
+ },
7504
+ {
7505
+ "epoch": 0.4704289890769495,
7506
+ "grad_norm": 88.03787994384766,
7507
+ "learning_rate": 9.9816238803569e-06,
7508
+ "loss": 11.2155,
7509
+ "step": 10600
7510
+ },
7511
+ {
7512
+ "epoch": 0.470872790010041,
7513
+ "grad_norm": 88.27069854736328,
7514
+ "learning_rate": 9.981606544394973e-06,
7515
+ "loss": 11.1796,
7516
+ "step": 10610
7517
+ },
7518
+ {
7519
+ "epoch": 0.47131659094313244,
7520
+ "grad_norm": 91.5178451538086,
7521
+ "learning_rate": 9.981589208433044e-06,
7522
+ "loss": 10.8964,
7523
+ "step": 10620
7524
+ },
7525
+ {
7526
+ "epoch": 0.47176039187622393,
7527
+ "grad_norm": 89.97875213623047,
7528
+ "learning_rate": 9.981571872471117e-06,
7529
+ "loss": 11.0617,
7530
+ "step": 10630
7531
+ },
7532
+ {
7533
+ "epoch": 0.47220419280931536,
7534
+ "grad_norm": 77.71656036376953,
7535
+ "learning_rate": 9.98155453650919e-06,
7536
+ "loss": 11.2671,
7537
+ "step": 10640
7538
+ },
7539
+ {
7540
+ "epoch": 0.47264799374240685,
7541
+ "grad_norm": 90.4183120727539,
7542
+ "learning_rate": 9.981537200547262e-06,
7543
+ "loss": 10.8842,
7544
+ "step": 10650
7545
+ },
7546
+ {
7547
+ "epoch": 0.4730917946754983,
7548
+ "grad_norm": 109.40415954589844,
7549
+ "learning_rate": 9.981519864585335e-06,
7550
+ "loss": 10.8791,
7551
+ "step": 10660
7552
+ },
7553
+ {
7554
+ "epoch": 0.47353559560858977,
7555
+ "grad_norm": 79.65886688232422,
7556
+ "learning_rate": 9.981502528623408e-06,
7557
+ "loss": 11.3155,
7558
+ "step": 10670
7559
+ },
7560
+ {
7561
+ "epoch": 0.47397939654168125,
7562
+ "grad_norm": 75.3977279663086,
7563
+ "learning_rate": 9.981485192661479e-06,
7564
+ "loss": 10.6279,
7565
+ "step": 10680
7566
+ },
7567
+ {
7568
+ "epoch": 0.4744231974747727,
7569
+ "grad_norm": 90.0768051147461,
7570
+ "learning_rate": 9.981467856699552e-06,
7571
+ "loss": 10.8386,
7572
+ "step": 10690
7573
+ },
7574
+ {
7575
+ "epoch": 0.47486699840786417,
7576
+ "grad_norm": 78.82632446289062,
7577
+ "learning_rate": 9.981450520737625e-06,
7578
+ "loss": 10.8152,
7579
+ "step": 10700
7580
+ },
7581
+ {
7582
+ "epoch": 0.4753107993409556,
7583
+ "grad_norm": 82.59823608398438,
7584
+ "learning_rate": 9.981433184775697e-06,
7585
+ "loss": 11.0986,
7586
+ "step": 10710
7587
+ },
7588
+ {
7589
+ "epoch": 0.4757546002740471,
7590
+ "grad_norm": 82.44734954833984,
7591
+ "learning_rate": 9.98141584881377e-06,
7592
+ "loss": 10.8024,
7593
+ "step": 10720
7594
+ },
7595
+ {
7596
+ "epoch": 0.4761984012071385,
7597
+ "grad_norm": 91.71231079101562,
7598
+ "learning_rate": 9.981398512851843e-06,
7599
+ "loss": 10.8372,
7600
+ "step": 10730
7601
+ },
7602
+ {
7603
+ "epoch": 0.47664220214023,
7604
+ "grad_norm": 87.06108093261719,
7605
+ "learning_rate": 9.981381176889914e-06,
7606
+ "loss": 11.0266,
7607
+ "step": 10740
7608
+ },
7609
+ {
7610
+ "epoch": 0.47708600307332144,
7611
+ "grad_norm": 90.88961029052734,
7612
+ "learning_rate": 9.981363840927987e-06,
7613
+ "loss": 10.607,
7614
+ "step": 10750
7615
+ },
7616
+ {
7617
+ "epoch": 0.47752980400641293,
7618
+ "grad_norm": 90.09719848632812,
7619
+ "learning_rate": 9.98134650496606e-06,
7620
+ "loss": 11.0468,
7621
+ "step": 10760
7622
+ },
7623
+ {
7624
+ "epoch": 0.47797360493950436,
7625
+ "grad_norm": 87.11450958251953,
7626
+ "learning_rate": 9.981329169004132e-06,
7627
+ "loss": 10.7198,
7628
+ "step": 10770
7629
+ },
7630
+ {
7631
+ "epoch": 0.47841740587259585,
7632
+ "grad_norm": 80.5800552368164,
7633
+ "learning_rate": 9.981311833042205e-06,
7634
+ "loss": 11.1395,
7635
+ "step": 10780
7636
+ },
7637
+ {
7638
+ "epoch": 0.47886120680568733,
7639
+ "grad_norm": 90.17820739746094,
7640
+ "learning_rate": 9.981294497080278e-06,
7641
+ "loss": 11.3689,
7642
+ "step": 10790
7643
+ },
7644
+ {
7645
+ "epoch": 0.47930500773877877,
7646
+ "grad_norm": 84.49495697021484,
7647
+ "learning_rate": 9.981277161118351e-06,
7648
+ "loss": 10.9592,
7649
+ "step": 10800
7650
+ },
7651
+ {
7652
+ "epoch": 0.47974880867187025,
7653
+ "grad_norm": 94.85502624511719,
7654
+ "learning_rate": 9.981259825156422e-06,
7655
+ "loss": 10.6913,
7656
+ "step": 10810
7657
+ },
7658
+ {
7659
+ "epoch": 0.4801926096049617,
7660
+ "grad_norm": 79.14407348632812,
7661
+ "learning_rate": 9.981242489194495e-06,
7662
+ "loss": 11.8454,
7663
+ "step": 10820
7664
+ },
7665
+ {
7666
+ "epoch": 0.48063641053805317,
7667
+ "grad_norm": 91.49024200439453,
7668
+ "learning_rate": 9.981225153232568e-06,
7669
+ "loss": 11.292,
7670
+ "step": 10830
7671
+ },
7672
+ {
7673
+ "epoch": 0.4810802114711446,
7674
+ "grad_norm": 97.75911712646484,
7675
+ "learning_rate": 9.98120781727064e-06,
7676
+ "loss": 11.3453,
7677
+ "step": 10840
7678
+ },
7679
+ {
7680
+ "epoch": 0.4815240124042361,
7681
+ "grad_norm": 76.58828735351562,
7682
+ "learning_rate": 9.981190481308713e-06,
7683
+ "loss": 11.1765,
7684
+ "step": 10850
7685
+ },
7686
+ {
7687
+ "epoch": 0.4819678133373275,
7688
+ "grad_norm": 79.92506408691406,
7689
+ "learning_rate": 9.981173145346786e-06,
7690
+ "loss": 10.868,
7691
+ "step": 10860
7692
+ },
7693
+ {
7694
+ "epoch": 0.482411614270419,
7695
+ "grad_norm": 101.02202606201172,
7696
+ "learning_rate": 9.981155809384857e-06,
7697
+ "loss": 11.0264,
7698
+ "step": 10870
7699
+ },
7700
+ {
7701
+ "epoch": 0.48285541520351044,
7702
+ "grad_norm": 81.69430541992188,
7703
+ "learning_rate": 9.98113847342293e-06,
7704
+ "loss": 11.2259,
7705
+ "step": 10880
7706
+ },
7707
+ {
7708
+ "epoch": 0.48329921613660193,
7709
+ "grad_norm": 86.8892822265625,
7710
+ "learning_rate": 9.981121137461004e-06,
7711
+ "loss": 11.0128,
7712
+ "step": 10890
7713
+ },
7714
+ {
7715
+ "epoch": 0.4837430170696934,
7716
+ "grad_norm": 72.10417938232422,
7717
+ "learning_rate": 9.981103801499075e-06,
7718
+ "loss": 11.3905,
7719
+ "step": 10900
7720
+ },
7721
+ {
7722
+ "epoch": 0.48418681800278485,
7723
+ "grad_norm": 85.03720092773438,
7724
+ "learning_rate": 9.981086465537148e-06,
7725
+ "loss": 11.1796,
7726
+ "step": 10910
7727
+ },
7728
+ {
7729
+ "epoch": 0.48463061893587633,
7730
+ "grad_norm": 88.04219818115234,
7731
+ "learning_rate": 9.981069129575221e-06,
7732
+ "loss": 11.2465,
7733
+ "step": 10920
7734
+ },
7735
+ {
7736
+ "epoch": 0.48507441986896777,
7737
+ "grad_norm": 73.39752960205078,
7738
+ "learning_rate": 9.981051793613294e-06,
7739
+ "loss": 10.9952,
7740
+ "step": 10930
7741
+ },
7742
+ {
7743
+ "epoch": 0.48551822080205925,
7744
+ "grad_norm": 91.22920227050781,
7745
+ "learning_rate": 9.981034457651366e-06,
7746
+ "loss": 10.9643,
7747
+ "step": 10940
7748
+ },
7749
+ {
7750
+ "epoch": 0.4859620217351507,
7751
+ "grad_norm": 67.8987045288086,
7752
+ "learning_rate": 9.981017121689439e-06,
7753
+ "loss": 11.1133,
7754
+ "step": 10950
7755
+ },
7756
+ {
7757
+ "epoch": 0.48640582266824217,
7758
+ "grad_norm": 78.0125732421875,
7759
+ "learning_rate": 9.980999785727512e-06,
7760
+ "loss": 11.5816,
7761
+ "step": 10960
7762
+ },
7763
+ {
7764
+ "epoch": 0.4868496236013336,
7765
+ "grad_norm": 90.03052520751953,
7766
+ "learning_rate": 9.980982449765583e-06,
7767
+ "loss": 11.2881,
7768
+ "step": 10970
7769
+ },
7770
+ {
7771
+ "epoch": 0.4872934245344251,
7772
+ "grad_norm": 90.72000122070312,
7773
+ "learning_rate": 9.980965113803656e-06,
7774
+ "loss": 11.1198,
7775
+ "step": 10980
7776
+ },
7777
+ {
7778
+ "epoch": 0.4877372254675165,
7779
+ "grad_norm": 72.88054656982422,
7780
+ "learning_rate": 9.98094777784173e-06,
7781
+ "loss": 11.3579,
7782
+ "step": 10990
7783
+ },
7784
+ {
7785
+ "epoch": 0.488181026400608,
7786
+ "grad_norm": 84.08674621582031,
7787
+ "learning_rate": 9.9809304418798e-06,
7788
+ "loss": 10.871,
7789
+ "step": 11000
7790
+ },
7791
+ {
7792
+ "epoch": 0.488181026400608,
7793
+ "eval_loss": 0.34528353810310364,
7794
+ "eval_runtime": 674.0977,
7795
+ "eval_samples_per_second": 1801.506,
7796
+ "eval_steps_per_second": 56.297,
7797
+ "step": 11000
7798
+ },
7799
+ {
7800
+ "epoch": 0.4886248273336995,
7801
+ "grad_norm": 74.27079010009766,
7802
+ "learning_rate": 9.980913105917874e-06,
7803
+ "loss": 11.5738,
7804
+ "step": 11010
7805
+ },
7806
+ {
7807
+ "epoch": 0.48906862826679093,
7808
+ "grad_norm": 79.84703063964844,
7809
+ "learning_rate": 9.980895769955947e-06,
7810
+ "loss": 10.8113,
7811
+ "step": 11020
7812
+ },
7813
+ {
7814
+ "epoch": 0.4895124291998824,
7815
+ "grad_norm": 78.84832000732422,
7816
+ "learning_rate": 9.980878433994018e-06,
7817
+ "loss": 11.1961,
7818
+ "step": 11030
7819
+ },
7820
+ {
7821
+ "epoch": 0.48995623013297385,
7822
+ "grad_norm": 77.97138977050781,
7823
+ "learning_rate": 9.980861098032091e-06,
7824
+ "loss": 10.747,
7825
+ "step": 11040
7826
+ },
7827
+ {
7828
+ "epoch": 0.49040003106606533,
7829
+ "grad_norm": 68.38355255126953,
7830
+ "learning_rate": 9.980843762070164e-06,
7831
+ "loss": 10.9581,
7832
+ "step": 11050
7833
+ },
7834
+ {
7835
+ "epoch": 0.49084383199915677,
7836
+ "grad_norm": 90.01298522949219,
7837
+ "learning_rate": 9.980826426108237e-06,
7838
+ "loss": 11.2473,
7839
+ "step": 11060
7840
+ },
7841
+ {
7842
+ "epoch": 0.49128763293224825,
7843
+ "grad_norm": 69.46513366699219,
7844
+ "learning_rate": 9.980809090146309e-06,
7845
+ "loss": 10.7865,
7846
+ "step": 11070
7847
+ },
7848
+ {
7849
+ "epoch": 0.4917314338653397,
7850
+ "grad_norm": 77.08251190185547,
7851
+ "learning_rate": 9.980791754184382e-06,
7852
+ "loss": 10.7613,
7853
+ "step": 11080
7854
+ },
7855
+ {
7856
+ "epoch": 0.49217523479843117,
7857
+ "grad_norm": 86.79558563232422,
7858
+ "learning_rate": 9.980774418222455e-06,
7859
+ "loss": 10.4478,
7860
+ "step": 11090
7861
+ },
7862
+ {
7863
+ "epoch": 0.4926190357315226,
7864
+ "grad_norm": 78.37332153320312,
7865
+ "learning_rate": 9.980757082260526e-06,
7866
+ "loss": 11.3574,
7867
+ "step": 11100
7868
+ },
7869
+ {
7870
+ "epoch": 0.4930628366646141,
7871
+ "grad_norm": 77.12301635742188,
7872
+ "learning_rate": 9.9807397462986e-06,
7873
+ "loss": 10.9563,
7874
+ "step": 11110
7875
+ },
7876
+ {
7877
+ "epoch": 0.4935066375977055,
7878
+ "grad_norm": 84.5125961303711,
7879
+ "learning_rate": 9.980722410336672e-06,
7880
+ "loss": 10.7285,
7881
+ "step": 11120
7882
+ },
7883
+ {
7884
+ "epoch": 0.493950438530797,
7885
+ "grad_norm": 72.76651763916016,
7886
+ "learning_rate": 9.980705074374744e-06,
7887
+ "loss": 10.8235,
7888
+ "step": 11130
7889
+ },
7890
+ {
7891
+ "epoch": 0.4943942394638885,
7892
+ "grad_norm": 65.90115356445312,
7893
+ "learning_rate": 9.980687738412817e-06,
7894
+ "loss": 11.2566,
7895
+ "step": 11140
7896
+ },
7897
+ {
7898
+ "epoch": 0.49483804039697993,
7899
+ "grad_norm": 93.98876190185547,
7900
+ "learning_rate": 9.98067040245089e-06,
7901
+ "loss": 11.1416,
7902
+ "step": 11150
7903
+ },
7904
+ {
7905
+ "epoch": 0.4952818413300714,
7906
+ "grad_norm": 84.59708404541016,
7907
+ "learning_rate": 9.980653066488961e-06,
7908
+ "loss": 10.9902,
7909
+ "step": 11160
7910
+ },
7911
+ {
7912
+ "epoch": 0.49572564226316285,
7913
+ "grad_norm": 82.25418090820312,
7914
+ "learning_rate": 9.980635730527034e-06,
7915
+ "loss": 10.9595,
7916
+ "step": 11170
7917
+ },
7918
+ {
7919
+ "epoch": 0.49616944319625433,
7920
+ "grad_norm": 77.34684753417969,
7921
+ "learning_rate": 9.980618394565108e-06,
7922
+ "loss": 11.4648,
7923
+ "step": 11180
7924
+ },
7925
+ {
7926
+ "epoch": 0.49661324412934577,
7927
+ "grad_norm": 93.93881225585938,
7928
+ "learning_rate": 9.98060105860318e-06,
7929
+ "loss": 10.8638,
7930
+ "step": 11190
7931
+ },
7932
+ {
7933
+ "epoch": 0.49705704506243725,
7934
+ "grad_norm": 70.26705932617188,
7935
+ "learning_rate": 9.980583722641252e-06,
7936
+ "loss": 10.9112,
7937
+ "step": 11200
7938
+ },
7939
+ {
7940
+ "epoch": 0.4975008459955287,
7941
+ "grad_norm": 82.66825103759766,
7942
+ "learning_rate": 9.980566386679325e-06,
7943
+ "loss": 11.5958,
7944
+ "step": 11210
7945
+ },
7946
+ {
7947
+ "epoch": 0.49794464692862017,
7948
+ "grad_norm": 86.82162475585938,
7949
+ "learning_rate": 9.980549050717398e-06,
7950
+ "loss": 10.8187,
7951
+ "step": 11220
7952
+ },
7953
+ {
7954
+ "epoch": 0.4983884478617116,
7955
+ "grad_norm": 79.9798355102539,
7956
+ "learning_rate": 9.98053171475547e-06,
7957
+ "loss": 11.2143,
7958
+ "step": 11230
7959
+ },
7960
+ {
7961
+ "epoch": 0.4988322487948031,
7962
+ "grad_norm": 99.23787689208984,
7963
+ "learning_rate": 9.980514378793543e-06,
7964
+ "loss": 10.9515,
7965
+ "step": 11240
7966
+ },
7967
+ {
7968
+ "epoch": 0.4992760497278946,
7969
+ "grad_norm": 70.37545013427734,
7970
+ "learning_rate": 9.980497042831616e-06,
7971
+ "loss": 11.0358,
7972
+ "step": 11250
7973
+ },
7974
+ {
7975
+ "epoch": 0.499719850660986,
7976
+ "grad_norm": 91.39663696289062,
7977
+ "learning_rate": 9.980479706869687e-06,
7978
+ "loss": 10.7406,
7979
+ "step": 11260
7980
+ },
7981
+ {
7982
+ "epoch": 0.5001636515940775,
7983
+ "grad_norm": 81.30348205566406,
7984
+ "learning_rate": 9.98046237090776e-06,
7985
+ "loss": 11.2646,
7986
+ "step": 11270
7987
+ },
7988
+ {
7989
+ "epoch": 0.500607452527169,
7990
+ "grad_norm": 83.41373443603516,
7991
+ "learning_rate": 9.980445034945833e-06,
7992
+ "loss": 10.5715,
7993
+ "step": 11280
7994
+ },
7995
+ {
7996
+ "epoch": 0.5010512534602604,
7997
+ "grad_norm": 84.33602142333984,
7998
+ "learning_rate": 9.980427698983906e-06,
7999
+ "loss": 10.9119,
8000
+ "step": 11290
8001
+ },
8002
+ {
8003
+ "epoch": 0.5014950543933518,
8004
+ "grad_norm": 76.0499038696289,
8005
+ "learning_rate": 9.980410363021978e-06,
8006
+ "loss": 11.1319,
8007
+ "step": 11300
8008
+ },
8009
+ {
8010
+ "epoch": 0.5019388553264433,
8011
+ "grad_norm": 87.45086669921875,
8012
+ "learning_rate": 9.98039302706005e-06,
8013
+ "loss": 10.3676,
8014
+ "step": 11310
8015
+ },
8016
+ {
8017
+ "epoch": 0.5023826562595348,
8018
+ "grad_norm": 88.60616302490234,
8019
+ "learning_rate": 9.980375691098124e-06,
8020
+ "loss": 11.0152,
8021
+ "step": 11320
8022
+ },
8023
+ {
8024
+ "epoch": 0.5028264571926262,
8025
+ "grad_norm": 91.6775894165039,
8026
+ "learning_rate": 9.980358355136195e-06,
8027
+ "loss": 10.8913,
8028
+ "step": 11330
8029
+ },
8030
+ {
8031
+ "epoch": 0.5032702581257177,
8032
+ "grad_norm": 83.63784790039062,
8033
+ "learning_rate": 9.980341019174268e-06,
8034
+ "loss": 10.4357,
8035
+ "step": 11340
8036
+ },
8037
+ {
8038
+ "epoch": 0.5037140590588092,
8039
+ "grad_norm": 93.07415008544922,
8040
+ "learning_rate": 9.980323683212341e-06,
8041
+ "loss": 10.3792,
8042
+ "step": 11350
8043
+ },
8044
+ {
8045
+ "epoch": 0.5041578599919007,
8046
+ "grad_norm": 88.19844818115234,
8047
+ "learning_rate": 9.980306347250413e-06,
8048
+ "loss": 10.886,
8049
+ "step": 11360
8050
+ },
8051
+ {
8052
+ "epoch": 0.5046016609249921,
8053
+ "grad_norm": 79.55142211914062,
8054
+ "learning_rate": 9.980289011288486e-06,
8055
+ "loss": 10.8876,
8056
+ "step": 11370
8057
+ },
8058
+ {
8059
+ "epoch": 0.5050454618580835,
8060
+ "grad_norm": 85.38150787353516,
8061
+ "learning_rate": 9.980271675326559e-06,
8062
+ "loss": 11.1223,
8063
+ "step": 11380
8064
+ },
8065
+ {
8066
+ "epoch": 0.505489262791175,
8067
+ "grad_norm": 69.45784759521484,
8068
+ "learning_rate": 9.98025433936463e-06,
8069
+ "loss": 10.688,
8070
+ "step": 11390
8071
+ },
8072
+ {
8073
+ "epoch": 0.5059330637242665,
8074
+ "grad_norm": 83.27287292480469,
8075
+ "learning_rate": 9.980237003402703e-06,
8076
+ "loss": 11.482,
8077
+ "step": 11400
8078
+ },
8079
+ {
8080
+ "epoch": 0.506376864657358,
8081
+ "grad_norm": 87.00122833251953,
8082
+ "learning_rate": 9.980219667440777e-06,
8083
+ "loss": 11.2689,
8084
+ "step": 11410
8085
+ },
8086
+ {
8087
+ "epoch": 0.5068206655904494,
8088
+ "grad_norm": 78.08293914794922,
8089
+ "learning_rate": 9.98020233147885e-06,
8090
+ "loss": 10.4533,
8091
+ "step": 11420
8092
+ },
8093
+ {
8094
+ "epoch": 0.5072644665235408,
8095
+ "grad_norm": 71.37157440185547,
8096
+ "learning_rate": 9.980184995516921e-06,
8097
+ "loss": 10.856,
8098
+ "step": 11430
8099
+ },
8100
+ {
8101
+ "epoch": 0.5077082674566323,
8102
+ "grad_norm": 78.55634307861328,
8103
+ "learning_rate": 9.980167659554994e-06,
8104
+ "loss": 11.0349,
8105
+ "step": 11440
8106
+ },
8107
+ {
8108
+ "epoch": 0.5081520683897238,
8109
+ "grad_norm": 83.2050552368164,
8110
+ "learning_rate": 9.980150323593067e-06,
8111
+ "loss": 11.0373,
8112
+ "step": 11450
8113
+ },
8114
+ {
8115
+ "epoch": 0.5085958693228152,
8116
+ "grad_norm": 94.2475357055664,
8117
+ "learning_rate": 9.980132987631139e-06,
8118
+ "loss": 10.5705,
8119
+ "step": 11460
8120
+ },
8121
+ {
8122
+ "epoch": 0.5090396702559067,
8123
+ "grad_norm": 75.6529769897461,
8124
+ "learning_rate": 9.980115651669212e-06,
8125
+ "loss": 10.8084,
8126
+ "step": 11470
8127
+ },
8128
+ {
8129
+ "epoch": 0.5094834711889982,
8130
+ "grad_norm": 68.75302124023438,
8131
+ "learning_rate": 9.980098315707285e-06,
8132
+ "loss": 10.4888,
8133
+ "step": 11480
8134
+ },
8135
+ {
8136
+ "epoch": 0.5099272721220897,
8137
+ "grad_norm": 88.40451049804688,
8138
+ "learning_rate": 9.980080979745356e-06,
8139
+ "loss": 11.4719,
8140
+ "step": 11490
8141
+ },
8142
+ {
8143
+ "epoch": 0.5103710730551811,
8144
+ "grad_norm": 85.7581558227539,
8145
+ "learning_rate": 9.980063643783429e-06,
8146
+ "loss": 11.1014,
8147
+ "step": 11500
8148
+ },
8149
+ {
8150
+ "epoch": 0.5108148739882725,
8151
+ "grad_norm": 79.09970092773438,
8152
+ "learning_rate": 9.980046307821502e-06,
8153
+ "loss": 11.0976,
8154
+ "step": 11510
8155
+ },
8156
+ {
8157
+ "epoch": 0.511258674921364,
8158
+ "grad_norm": 83.1121597290039,
8159
+ "learning_rate": 9.980028971859574e-06,
8160
+ "loss": 10.8814,
8161
+ "step": 11520
8162
+ },
8163
+ {
8164
+ "epoch": 0.5117024758544555,
8165
+ "grad_norm": 77.43012237548828,
8166
+ "learning_rate": 9.980011635897647e-06,
8167
+ "loss": 10.775,
8168
+ "step": 11530
8169
+ },
8170
+ {
8171
+ "epoch": 0.512146276787547,
8172
+ "grad_norm": 68.88916778564453,
8173
+ "learning_rate": 9.97999429993572e-06,
8174
+ "loss": 11.1833,
8175
+ "step": 11540
8176
+ },
8177
+ {
8178
+ "epoch": 0.5125900777206384,
8179
+ "grad_norm": 70.91609954833984,
8180
+ "learning_rate": 9.979976963973793e-06,
8181
+ "loss": 10.6464,
8182
+ "step": 11550
8183
+ },
8184
+ {
8185
+ "epoch": 0.5130338786537298,
8186
+ "grad_norm": 83.58845520019531,
8187
+ "learning_rate": 9.979959628011864e-06,
8188
+ "loss": 10.9326,
8189
+ "step": 11560
8190
+ },
8191
+ {
8192
+ "epoch": 0.5134776795868213,
8193
+ "grad_norm": 81.35511016845703,
8194
+ "learning_rate": 9.979942292049937e-06,
8195
+ "loss": 11.163,
8196
+ "step": 11570
8197
+ },
8198
+ {
8199
+ "epoch": 0.5139214805199128,
8200
+ "grad_norm": 77.6694564819336,
8201
+ "learning_rate": 9.97992495608801e-06,
8202
+ "loss": 10.8365,
8203
+ "step": 11580
8204
+ },
8205
+ {
8206
+ "epoch": 0.5143652814530043,
8207
+ "grad_norm": 87.32294464111328,
8208
+ "learning_rate": 9.979907620126082e-06,
8209
+ "loss": 10.8808,
8210
+ "step": 11590
8211
+ },
8212
+ {
8213
+ "epoch": 0.5148090823860957,
8214
+ "grad_norm": 79.58880615234375,
8215
+ "learning_rate": 9.979890284164155e-06,
8216
+ "loss": 10.7793,
8217
+ "step": 11600
8218
+ },
8219
+ {
8220
+ "epoch": 0.5152528833191872,
8221
+ "grad_norm": 70.23893737792969,
8222
+ "learning_rate": 9.979872948202228e-06,
8223
+ "loss": 10.9845,
8224
+ "step": 11610
8225
+ },
8226
+ {
8227
+ "epoch": 0.5156966842522787,
8228
+ "grad_norm": 72.99483489990234,
8229
+ "learning_rate": 9.9798556122403e-06,
8230
+ "loss": 10.8218,
8231
+ "step": 11620
8232
+ },
8233
+ {
8234
+ "epoch": 0.5161404851853701,
8235
+ "grad_norm": 96.56175994873047,
8236
+ "learning_rate": 9.979838276278372e-06,
8237
+ "loss": 11.0247,
8238
+ "step": 11630
8239
+ },
8240
+ {
8241
+ "epoch": 0.5165842861184615,
8242
+ "grad_norm": 76.31138610839844,
8243
+ "learning_rate": 9.979820940316445e-06,
8244
+ "loss": 11.0729,
8245
+ "step": 11640
8246
+ },
8247
+ {
8248
+ "epoch": 0.517028087051553,
8249
+ "grad_norm": 77.18230438232422,
8250
+ "learning_rate": 9.979803604354517e-06,
8251
+ "loss": 10.3969,
8252
+ "step": 11650
8253
+ },
8254
+ {
8255
+ "epoch": 0.5174718879846445,
8256
+ "grad_norm": 90.09929656982422,
8257
+ "learning_rate": 9.97978626839259e-06,
8258
+ "loss": 11.3239,
8259
+ "step": 11660
8260
+ },
8261
+ {
8262
+ "epoch": 0.517915688917736,
8263
+ "grad_norm": 69.90628051757812,
8264
+ "learning_rate": 9.979768932430663e-06,
8265
+ "loss": 10.7297,
8266
+ "step": 11670
8267
+ },
8268
+ {
8269
+ "epoch": 0.5183594898508274,
8270
+ "grad_norm": 90.91302490234375,
8271
+ "learning_rate": 9.979751596468736e-06,
8272
+ "loss": 11.3096,
8273
+ "step": 11680
8274
+ },
8275
+ {
8276
+ "epoch": 0.5188032907839188,
8277
+ "grad_norm": 81.82889556884766,
8278
+ "learning_rate": 9.979734260506807e-06,
8279
+ "loss": 10.9807,
8280
+ "step": 11690
8281
+ },
8282
+ {
8283
+ "epoch": 0.5192470917170103,
8284
+ "grad_norm": 86.98174285888672,
8285
+ "learning_rate": 9.97971692454488e-06,
8286
+ "loss": 11.2847,
8287
+ "step": 11700
8288
+ },
8289
+ {
8290
+ "epoch": 0.5196908926501018,
8291
+ "grad_norm": 72.4999771118164,
8292
+ "learning_rate": 9.979699588582954e-06,
8293
+ "loss": 10.7933,
8294
+ "step": 11710
8295
+ },
8296
+ {
8297
+ "epoch": 0.5201346935831933,
8298
+ "grad_norm": 82.73175048828125,
8299
+ "learning_rate": 9.979682252621025e-06,
8300
+ "loss": 10.8406,
8301
+ "step": 11720
8302
+ },
8303
+ {
8304
+ "epoch": 0.5205784945162847,
8305
+ "grad_norm": 68.61962127685547,
8306
+ "learning_rate": 9.979664916659098e-06,
8307
+ "loss": 10.7406,
8308
+ "step": 11730
8309
+ },
8310
+ {
8311
+ "epoch": 0.5210222954493762,
8312
+ "grad_norm": 80.85415649414062,
8313
+ "learning_rate": 9.979647580697171e-06,
8314
+ "loss": 11.0003,
8315
+ "step": 11740
8316
+ },
8317
+ {
8318
+ "epoch": 0.5214660963824677,
8319
+ "grad_norm": 81.90907287597656,
8320
+ "learning_rate": 9.979630244735243e-06,
8321
+ "loss": 11.4761,
8322
+ "step": 11750
8323
+ },
8324
+ {
8325
+ "epoch": 0.5219098973155591,
8326
+ "grad_norm": 70.7921142578125,
8327
+ "learning_rate": 9.979612908773316e-06,
8328
+ "loss": 10.7498,
8329
+ "step": 11760
8330
+ },
8331
+ {
8332
+ "epoch": 0.5223536982486505,
8333
+ "grad_norm": 74.50489044189453,
8334
+ "learning_rate": 9.979595572811389e-06,
8335
+ "loss": 11.7802,
8336
+ "step": 11770
8337
+ },
8338
+ {
8339
+ "epoch": 0.522797499181742,
8340
+ "grad_norm": 75.52880859375,
8341
+ "learning_rate": 9.97957823684946e-06,
8342
+ "loss": 10.7195,
8343
+ "step": 11780
8344
+ },
8345
+ {
8346
+ "epoch": 0.5232413001148335,
8347
+ "grad_norm": 80.11971282958984,
8348
+ "learning_rate": 9.979560900887533e-06,
8349
+ "loss": 10.747,
8350
+ "step": 11790
8351
+ },
8352
+ {
8353
+ "epoch": 0.523685101047925,
8354
+ "grad_norm": 84.3338851928711,
8355
+ "learning_rate": 9.979543564925606e-06,
8356
+ "loss": 11.132,
8357
+ "step": 11800
8358
+ },
8359
+ {
8360
+ "epoch": 0.5241289019810164,
8361
+ "grad_norm": 80.16667938232422,
8362
+ "learning_rate": 9.979526228963678e-06,
8363
+ "loss": 11.2093,
8364
+ "step": 11810
8365
+ },
8366
+ {
8367
+ "epoch": 0.5245727029141078,
8368
+ "grad_norm": 70.78595733642578,
8369
+ "learning_rate": 9.97950889300175e-06,
8370
+ "loss": 10.7076,
8371
+ "step": 11820
8372
+ },
8373
+ {
8374
+ "epoch": 0.5250165038471993,
8375
+ "grad_norm": 75.8795394897461,
8376
+ "learning_rate": 9.979491557039824e-06,
8377
+ "loss": 10.7308,
8378
+ "step": 11830
8379
+ },
8380
+ {
8381
+ "epoch": 0.5254603047802908,
8382
+ "grad_norm": 87.91172790527344,
8383
+ "learning_rate": 9.979474221077895e-06,
8384
+ "loss": 10.3702,
8385
+ "step": 11840
8386
+ },
8387
+ {
8388
+ "epoch": 0.5259041057133823,
8389
+ "grad_norm": 86.1435317993164,
8390
+ "learning_rate": 9.979456885115968e-06,
8391
+ "loss": 10.6837,
8392
+ "step": 11850
8393
+ },
8394
+ {
8395
+ "epoch": 0.5263479066464737,
8396
+ "grad_norm": 74.28446960449219,
8397
+ "learning_rate": 9.979439549154041e-06,
8398
+ "loss": 11.3425,
8399
+ "step": 11860
8400
+ },
8401
+ {
8402
+ "epoch": 0.5267917075795652,
8403
+ "grad_norm": 77.42523956298828,
8404
+ "learning_rate": 9.979422213192113e-06,
8405
+ "loss": 10.7451,
8406
+ "step": 11870
8407
+ },
8408
+ {
8409
+ "epoch": 0.5272355085126567,
8410
+ "grad_norm": 71.55403900146484,
8411
+ "learning_rate": 9.979404877230186e-06,
8412
+ "loss": 11.2173,
8413
+ "step": 11880
8414
+ },
8415
+ {
8416
+ "epoch": 0.5276793094457481,
8417
+ "grad_norm": 79.82381439208984,
8418
+ "learning_rate": 9.979387541268259e-06,
8419
+ "loss": 10.9905,
8420
+ "step": 11890
8421
+ },
8422
+ {
8423
+ "epoch": 0.5281231103788395,
8424
+ "grad_norm": 90.72064208984375,
8425
+ "learning_rate": 9.979370205306332e-06,
8426
+ "loss": 10.8288,
8427
+ "step": 11900
8428
+ },
8429
+ {
8430
+ "epoch": 0.528566911311931,
8431
+ "grad_norm": 68.74069213867188,
8432
+ "learning_rate": 9.979352869344403e-06,
8433
+ "loss": 11.3552,
8434
+ "step": 11910
8435
+ },
8436
+ {
8437
+ "epoch": 0.5290107122450225,
8438
+ "grad_norm": 79.70250701904297,
8439
+ "learning_rate": 9.979335533382476e-06,
8440
+ "loss": 11.0605,
8441
+ "step": 11920
8442
+ },
8443
+ {
8444
+ "epoch": 0.529454513178114,
8445
+ "grad_norm": 77.91078186035156,
8446
+ "learning_rate": 9.97931819742055e-06,
8447
+ "loss": 10.8511,
8448
+ "step": 11930
8449
+ },
8450
+ {
8451
+ "epoch": 0.5298983141112055,
8452
+ "grad_norm": 75.19290924072266,
8453
+ "learning_rate": 9.979300861458621e-06,
8454
+ "loss": 11.1562,
8455
+ "step": 11940
8456
+ },
8457
+ {
8458
+ "epoch": 0.5303421150442968,
8459
+ "grad_norm": 86.28694152832031,
8460
+ "learning_rate": 9.979283525496694e-06,
8461
+ "loss": 10.9518,
8462
+ "step": 11950
8463
+ },
8464
+ {
8465
+ "epoch": 0.5307859159773883,
8466
+ "grad_norm": 83.85566711425781,
8467
+ "learning_rate": 9.979266189534767e-06,
8468
+ "loss": 10.5831,
8469
+ "step": 11960
8470
+ },
8471
+ {
8472
+ "epoch": 0.5312297169104798,
8473
+ "grad_norm": 89.48451232910156,
8474
+ "learning_rate": 9.979248853572838e-06,
8475
+ "loss": 11.1542,
8476
+ "step": 11970
8477
+ },
8478
+ {
8479
+ "epoch": 0.5316735178435713,
8480
+ "grad_norm": 74.22183227539062,
8481
+ "learning_rate": 9.979231517610911e-06,
8482
+ "loss": 11.742,
8483
+ "step": 11980
8484
+ },
8485
+ {
8486
+ "epoch": 0.5321173187766627,
8487
+ "grad_norm": 74.32493591308594,
8488
+ "learning_rate": 9.979214181648985e-06,
8489
+ "loss": 11.5062,
8490
+ "step": 11990
8491
+ },
8492
+ {
8493
+ "epoch": 0.5325611197097542,
8494
+ "grad_norm": 84.4752197265625,
8495
+ "learning_rate": 9.979196845687056e-06,
8496
+ "loss": 11.24,
8497
+ "step": 12000
8498
+ },
8499
+ {
8500
+ "epoch": 0.5325611197097542,
8501
+ "eval_loss": 0.3403577506542206,
8502
+ "eval_runtime": 674.8568,
8503
+ "eval_samples_per_second": 1799.479,
8504
+ "eval_steps_per_second": 56.234,
8505
+ "step": 12000
8506
+ },
8507
+ {
8508
+ "epoch": 0.5330049206428457,
8509
+ "grad_norm": 70.7219467163086,
8510
+ "learning_rate": 9.979179509725129e-06,
8511
+ "loss": 10.7596,
8512
+ "step": 12010
8513
+ },
8514
+ {
8515
+ "epoch": 0.5334487215759371,
8516
+ "grad_norm": 81.48099517822266,
8517
+ "learning_rate": 9.979162173763202e-06,
8518
+ "loss": 10.9401,
8519
+ "step": 12020
8520
+ },
8521
+ {
8522
+ "epoch": 0.5338925225090285,
8523
+ "grad_norm": 90.189208984375,
8524
+ "learning_rate": 9.979144837801273e-06,
8525
+ "loss": 11.0275,
8526
+ "step": 12030
8527
+ },
8528
+ {
8529
+ "epoch": 0.53433632344212,
8530
+ "grad_norm": 77.25494384765625,
8531
+ "learning_rate": 9.979127501839347e-06,
8532
+ "loss": 10.5871,
8533
+ "step": 12040
8534
+ },
8535
+ {
8536
+ "epoch": 0.5347801243752115,
8537
+ "grad_norm": 83.02628326416016,
8538
+ "learning_rate": 9.97911016587742e-06,
8539
+ "loss": 11.6188,
8540
+ "step": 12050
8541
+ },
8542
+ {
8543
+ "epoch": 0.535223925308303,
8544
+ "grad_norm": 73.51893615722656,
8545
+ "learning_rate": 9.979092829915491e-06,
8546
+ "loss": 11.0726,
8547
+ "step": 12060
8548
+ },
8549
+ {
8550
+ "epoch": 0.5356677262413945,
8551
+ "grad_norm": 84.49578857421875,
8552
+ "learning_rate": 9.979075493953564e-06,
8553
+ "loss": 11.4908,
8554
+ "step": 12070
8555
+ },
8556
+ {
8557
+ "epoch": 0.5361115271744858,
8558
+ "grad_norm": 71.82061767578125,
8559
+ "learning_rate": 9.979058157991637e-06,
8560
+ "loss": 10.8268,
8561
+ "step": 12080
8562
+ },
8563
+ {
8564
+ "epoch": 0.5365553281075773,
8565
+ "grad_norm": 79.56192016601562,
8566
+ "learning_rate": 9.979040822029709e-06,
8567
+ "loss": 10.6541,
8568
+ "step": 12090
8569
+ },
8570
+ {
8571
+ "epoch": 0.5369991290406688,
8572
+ "grad_norm": 92.98332977294922,
8573
+ "learning_rate": 9.979023486067782e-06,
8574
+ "loss": 10.8328,
8575
+ "step": 12100
8576
+ },
8577
+ {
8578
+ "epoch": 0.5374429299737603,
8579
+ "grad_norm": 78.83235931396484,
8580
+ "learning_rate": 9.979006150105855e-06,
8581
+ "loss": 10.7446,
8582
+ "step": 12110
8583
+ },
8584
+ {
8585
+ "epoch": 0.5378867309068517,
8586
+ "grad_norm": 85.79434204101562,
8587
+ "learning_rate": 9.978988814143928e-06,
8588
+ "loss": 11.2323,
8589
+ "step": 12120
8590
+ },
8591
+ {
8592
+ "epoch": 0.5383305318399432,
8593
+ "grad_norm": 74.76625061035156,
8594
+ "learning_rate": 9.978971478182e-06,
8595
+ "loss": 11.2963,
8596
+ "step": 12130
8597
+ },
8598
+ {
8599
+ "epoch": 0.5387743327730347,
8600
+ "grad_norm": 75.79556274414062,
8601
+ "learning_rate": 9.978954142220072e-06,
8602
+ "loss": 11.4193,
8603
+ "step": 12140
8604
+ },
8605
+ {
8606
+ "epoch": 0.5392181337061261,
8607
+ "grad_norm": 71.63494110107422,
8608
+ "learning_rate": 9.978936806258145e-06,
8609
+ "loss": 10.7985,
8610
+ "step": 12150
8611
+ },
8612
+ {
8613
+ "epoch": 0.5396619346392176,
8614
+ "grad_norm": 85.59138488769531,
8615
+ "learning_rate": 9.978919470296217e-06,
8616
+ "loss": 11.4485,
8617
+ "step": 12160
8618
+ },
8619
+ {
8620
+ "epoch": 0.540105735572309,
8621
+ "grad_norm": 76.133544921875,
8622
+ "learning_rate": 9.97890213433429e-06,
8623
+ "loss": 11.4676,
8624
+ "step": 12170
8625
+ },
8626
+ {
8627
+ "epoch": 0.5405495365054005,
8628
+ "grad_norm": 74.8232650756836,
8629
+ "learning_rate": 9.978884798372363e-06,
8630
+ "loss": 10.934,
8631
+ "step": 12180
8632
+ },
8633
+ {
8634
+ "epoch": 0.540993337438492,
8635
+ "grad_norm": 82.42066955566406,
8636
+ "learning_rate": 9.978867462410434e-06,
8637
+ "loss": 11.0111,
8638
+ "step": 12190
8639
+ },
8640
+ {
8641
+ "epoch": 0.5414371383715835,
8642
+ "grad_norm": 80.84729766845703,
8643
+ "learning_rate": 9.978850126448507e-06,
8644
+ "loss": 10.4483,
8645
+ "step": 12200
8646
+ },
8647
+ {
8648
+ "epoch": 0.5418809393046748,
8649
+ "grad_norm": 83.81796264648438,
8650
+ "learning_rate": 9.97883279048658e-06,
8651
+ "loss": 10.8341,
8652
+ "step": 12210
8653
+ },
8654
+ {
8655
+ "epoch": 0.5423247402377663,
8656
+ "grad_norm": 73.58375549316406,
8657
+ "learning_rate": 9.978815454524652e-06,
8658
+ "loss": 10.5816,
8659
+ "step": 12220
8660
+ },
8661
+ {
8662
+ "epoch": 0.5427685411708578,
8663
+ "grad_norm": 94.62816619873047,
8664
+ "learning_rate": 9.978798118562725e-06,
8665
+ "loss": 11.0264,
8666
+ "step": 12230
8667
+ },
8668
+ {
8669
+ "epoch": 0.5432123421039493,
8670
+ "grad_norm": 80.29084014892578,
8671
+ "learning_rate": 9.978780782600798e-06,
8672
+ "loss": 11.1186,
8673
+ "step": 12240
8674
+ },
8675
+ {
8676
+ "epoch": 0.5436561430370407,
8677
+ "grad_norm": 86.72004699707031,
8678
+ "learning_rate": 9.97876344663887e-06,
8679
+ "loss": 10.6926,
8680
+ "step": 12250
8681
+ },
8682
+ {
8683
+ "epoch": 0.5440999439701322,
8684
+ "grad_norm": 78.45811462402344,
8685
+ "learning_rate": 9.978746110676942e-06,
8686
+ "loss": 10.6757,
8687
+ "step": 12260
8688
+ },
8689
+ {
8690
+ "epoch": 0.5445437449032237,
8691
+ "grad_norm": 81.1561050415039,
8692
+ "learning_rate": 9.978728774715015e-06,
8693
+ "loss": 10.9724,
8694
+ "step": 12270
8695
+ },
8696
+ {
8697
+ "epoch": 0.5449875458363151,
8698
+ "grad_norm": 81.57537841796875,
8699
+ "learning_rate": 9.978711438753087e-06,
8700
+ "loss": 10.9973,
8701
+ "step": 12280
8702
+ },
8703
+ {
8704
+ "epoch": 0.5454313467694066,
8705
+ "grad_norm": 71.61268615722656,
8706
+ "learning_rate": 9.97869410279116e-06,
8707
+ "loss": 10.961,
8708
+ "step": 12290
8709
+ },
8710
+ {
8711
+ "epoch": 0.545875147702498,
8712
+ "grad_norm": 69.37332153320312,
8713
+ "learning_rate": 9.978676766829233e-06,
8714
+ "loss": 10.6944,
8715
+ "step": 12300
8716
+ },
8717
+ {
8718
+ "epoch": 0.5463189486355895,
8719
+ "grad_norm": 75.34327697753906,
8720
+ "learning_rate": 9.978659430867304e-06,
8721
+ "loss": 10.9383,
8722
+ "step": 12310
8723
+ },
8724
+ {
8725
+ "epoch": 0.546762749568681,
8726
+ "grad_norm": 81.64777374267578,
8727
+ "learning_rate": 9.978642094905377e-06,
8728
+ "loss": 10.8668,
8729
+ "step": 12320
8730
+ },
8731
+ {
8732
+ "epoch": 0.5472065505017725,
8733
+ "grad_norm": 77.13945770263672,
8734
+ "learning_rate": 9.97862475894345e-06,
8735
+ "loss": 10.5414,
8736
+ "step": 12330
8737
+ },
8738
+ {
8739
+ "epoch": 0.5476503514348638,
8740
+ "grad_norm": 68.74790954589844,
8741
+ "learning_rate": 9.978607422981524e-06,
8742
+ "loss": 10.5574,
8743
+ "step": 12340
8744
+ },
8745
+ {
8746
+ "epoch": 0.5480941523679553,
8747
+ "grad_norm": 73.5103988647461,
8748
+ "learning_rate": 9.978590087019595e-06,
8749
+ "loss": 10.6581,
8750
+ "step": 12350
8751
+ },
8752
+ {
8753
+ "epoch": 0.5485379533010468,
8754
+ "grad_norm": 74.06155395507812,
8755
+ "learning_rate": 9.978572751057668e-06,
8756
+ "loss": 10.9724,
8757
+ "step": 12360
8758
+ },
8759
+ {
8760
+ "epoch": 0.5489817542341383,
8761
+ "grad_norm": 83.41978454589844,
8762
+ "learning_rate": 9.978555415095741e-06,
8763
+ "loss": 10.8189,
8764
+ "step": 12370
8765
+ },
8766
+ {
8767
+ "epoch": 0.5494255551672297,
8768
+ "grad_norm": 75.58580780029297,
8769
+ "learning_rate": 9.978538079133813e-06,
8770
+ "loss": 10.9859,
8771
+ "step": 12380
8772
+ },
8773
+ {
8774
+ "epoch": 0.5498693561003212,
8775
+ "grad_norm": 88.49089050292969,
8776
+ "learning_rate": 9.978520743171886e-06,
8777
+ "loss": 10.7513,
8778
+ "step": 12390
8779
+ },
8780
+ {
8781
+ "epoch": 0.5503131570334127,
8782
+ "grad_norm": 58.82784652709961,
8783
+ "learning_rate": 9.978503407209959e-06,
8784
+ "loss": 10.5162,
8785
+ "step": 12400
8786
+ },
8787
+ {
8788
+ "epoch": 0.5507569579665041,
8789
+ "grad_norm": 83.45179748535156,
8790
+ "learning_rate": 9.97848607124803e-06,
8791
+ "loss": 10.8963,
8792
+ "step": 12410
8793
+ },
8794
+ {
8795
+ "epoch": 0.5512007588995956,
8796
+ "grad_norm": 78.52918243408203,
8797
+ "learning_rate": 9.978468735286103e-06,
8798
+ "loss": 11.3171,
8799
+ "step": 12420
8800
+ },
8801
+ {
8802
+ "epoch": 0.551644559832687,
8803
+ "grad_norm": 92.01652526855469,
8804
+ "learning_rate": 9.978451399324176e-06,
8805
+ "loss": 10.9977,
8806
+ "step": 12430
8807
+ },
8808
+ {
8809
+ "epoch": 0.5520883607657785,
8810
+ "grad_norm": 81.95563507080078,
8811
+ "learning_rate": 9.978434063362248e-06,
8812
+ "loss": 10.8333,
8813
+ "step": 12440
8814
+ },
8815
+ {
8816
+ "epoch": 0.55253216169887,
8817
+ "grad_norm": 77.15538024902344,
8818
+ "learning_rate": 9.97841672740032e-06,
8819
+ "loss": 11.1091,
8820
+ "step": 12450
8821
+ },
8822
+ {
8823
+ "epoch": 0.5529759626319615,
8824
+ "grad_norm": 77.65177917480469,
8825
+ "learning_rate": 9.978399391438394e-06,
8826
+ "loss": 10.9557,
8827
+ "step": 12460
8828
+ },
8829
+ {
8830
+ "epoch": 0.5534197635650528,
8831
+ "grad_norm": 78.06676483154297,
8832
+ "learning_rate": 9.978382055476465e-06,
8833
+ "loss": 10.6794,
8834
+ "step": 12470
8835
+ },
8836
+ {
8837
+ "epoch": 0.5538635644981443,
8838
+ "grad_norm": 74.32503509521484,
8839
+ "learning_rate": 9.978364719514538e-06,
8840
+ "loss": 11.1013,
8841
+ "step": 12480
8842
+ },
8843
+ {
8844
+ "epoch": 0.5543073654312358,
8845
+ "grad_norm": 81.63748931884766,
8846
+ "learning_rate": 9.978347383552611e-06,
8847
+ "loss": 11.074,
8848
+ "step": 12490
8849
+ },
8850
+ {
8851
+ "epoch": 0.5547511663643273,
8852
+ "grad_norm": 81.70726776123047,
8853
+ "learning_rate": 9.978330047590683e-06,
8854
+ "loss": 10.3868,
8855
+ "step": 12500
8856
  }
8857
  ],
8858
  "logging_steps": 10,
 
8872
  "attributes": {}
8873
  }
8874
  },
8875
+ "total_flos": 4.36215400628224e+18,
8876
  "train_batch_size": 4,
8877
  "trial_name": null,
8878
  "trial_params": null