kurtpayne commited on
Commit
6bb5517
·
verified ·
1 Parent(s): 0ad744a

Training in progress, epoch 6, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a73a838267ef1240e7d2b13c9bef0777c40e5adbde0b41eaad2e2285a1456e59
3
  size 41326816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:112bc0f3b1b2dc5eb8f630066e354cf482957d3ac748abe391ee31395fad762e
3
  size 41326816
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a06226d090ebf29a224eceb62d58a1bcfdd7269b35eeb66b422c31de8877b2af
3
  size 82710219
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0024a85a2b2d0da2a5ffb75534854bdb966b34ff8920d045a44b4d4e6d074b04
3
  size 82710219
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5bf161a9696463c6af6428f0b062cd6b9cd46cff929e12497c27ed75936b3a3
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac86114ab63a824512c8a12cd10875b7e54b744aa8a31b22b1e280aabb9491a0
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d2558af33a573a47dbf1fceaa66f5d678ba05e35463e4567df2befb7aba332f
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4c83bb7defda8c3f341201fe6246d0e1cf73bcb0df11b3ed7236d93a2e62019
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f30a2df51250c628da6790996d2f11afea3cdcc7e4850ca3d0e75a15719abbb
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65322c8c832ca6bf70d61640f5bbef4c3ecd471fce4e418dae14f76a3034d8bd
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 7641,
3
- "best_metric": 0.2098543792963028,
4
- "best_model_checkpoint": "/tmp/tmpxojw69__/adapter-multilabel/checkpoint-7641",
5
- "epoch": 5.0,
6
  "eval_steps": 500,
7
- "global_step": 12735,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8959,6 +8959,1799 @@
8959
  "eval_samples_per_second": 200.007,
8960
  "eval_steps_per_second": 25.001,
8961
  "step": 12735
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8962
  }
8963
  ],
8964
  "logging_steps": 10,
@@ -8973,7 +10766,7 @@
8973
  "early_stopping_threshold": 0.0
8974
  },
8975
  "attributes": {
8976
- "early_stopping_patience_counter": 2
8977
  }
8978
  },
8979
  "TrainerControl": {
@@ -8982,12 +10775,12 @@
8982
  "should_evaluate": false,
8983
  "should_log": false,
8984
  "should_save": true,
8985
- "should_training_stop": false
8986
  },
8987
  "attributes": {}
8988
  }
8989
  },
8990
- "total_flos": 2.8453099461666144e+16,
8991
  "train_batch_size": 8,
8992
  "trial_name": null,
8993
  "trial_params": null
 
1
  {
2
+ "best_global_step": 15282,
3
+ "best_metric": 0.19693690538406372,
4
+ "best_model_checkpoint": "/tmp/tmpxojw69__/adapter-multilabel/checkpoint-15282",
5
+ "epoch": 6.0,
6
  "eval_steps": 500,
7
+ "global_step": 15282,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8959
  "eval_samples_per_second": 200.007,
8960
  "eval_steps_per_second": 25.001,
8961
  "step": 12735
8962
+ },
8963
+ {
8964
+ "epoch": 5.001963093835886,
8965
+ "grad_norm": 1.4495255947113037,
8966
+ "learning_rate": 2.773576674180179e-06,
8967
+ "loss": 0.0809,
8968
+ "step": 12740
8969
+ },
8970
+ {
8971
+ "epoch": 5.005889281507656,
8972
+ "grad_norm": 1.010282039642334,
8973
+ "learning_rate": 2.7626699629171816e-06,
8974
+ "loss": 0.0899,
8975
+ "step": 12750
8976
+ },
8977
+ {
8978
+ "epoch": 5.0098154691794266,
8979
+ "grad_norm": 0.13367398083209991,
8980
+ "learning_rate": 2.7517632516541846e-06,
8981
+ "loss": 0.0142,
8982
+ "step": 12760
8983
+ },
8984
+ {
8985
+ "epoch": 5.013741656851198,
8986
+ "grad_norm": 17.02937126159668,
8987
+ "learning_rate": 2.740856540391187e-06,
8988
+ "loss": 0.3372,
8989
+ "step": 12770
8990
+ },
8991
+ {
8992
+ "epoch": 5.017667844522968,
8993
+ "grad_norm": 0.005055473186075687,
8994
+ "learning_rate": 2.7299498291281905e-06,
8995
+ "loss": 0.0114,
8996
+ "step": 12780
8997
+ },
8998
+ {
8999
+ "epoch": 5.021594032194739,
9000
+ "grad_norm": 0.22540545463562012,
9001
+ "learning_rate": 2.719043117865193e-06,
9002
+ "loss": 0.1835,
9003
+ "step": 12790
9004
+ },
9005
+ {
9006
+ "epoch": 5.025520219866509,
9007
+ "grad_norm": 9.437993049621582,
9008
+ "learning_rate": 2.708136406602196e-06,
9009
+ "loss": 0.0482,
9010
+ "step": 12800
9011
+ },
9012
+ {
9013
+ "epoch": 5.0294464075382805,
9014
+ "grad_norm": 0.912726879119873,
9015
+ "learning_rate": 2.6972296953391986e-06,
9016
+ "loss": 0.0343,
9017
+ "step": 12810
9018
+ },
9019
+ {
9020
+ "epoch": 5.033372595210051,
9021
+ "grad_norm": 0.3685736656188965,
9022
+ "learning_rate": 2.6863229840762016e-06,
9023
+ "loss": 0.0036,
9024
+ "step": 12820
9025
+ },
9026
+ {
9027
+ "epoch": 5.037298782881821,
9028
+ "grad_norm": 0.060967061668634415,
9029
+ "learning_rate": 2.6754162728132046e-06,
9030
+ "loss": 0.0128,
9031
+ "step": 12830
9032
+ },
9033
+ {
9034
+ "epoch": 5.041224970553593,
9035
+ "grad_norm": 0.17684897780418396,
9036
+ "learning_rate": 2.664509561550207e-06,
9037
+ "loss": 0.0706,
9038
+ "step": 12840
9039
+ },
9040
+ {
9041
+ "epoch": 5.045151158225363,
9042
+ "grad_norm": 0.45533451437950134,
9043
+ "learning_rate": 2.6536028502872105e-06,
9044
+ "loss": 0.1257,
9045
+ "step": 12850
9046
+ },
9047
+ {
9048
+ "epoch": 5.049077345897134,
9049
+ "grad_norm": 23.107601165771484,
9050
+ "learning_rate": 2.642696139024213e-06,
9051
+ "loss": 0.1421,
9052
+ "step": 12860
9053
+ },
9054
+ {
9055
+ "epoch": 5.053003533568905,
9056
+ "grad_norm": 0.9842678308486938,
9057
+ "learning_rate": 2.631789427761216e-06,
9058
+ "loss": 0.0861,
9059
+ "step": 12870
9060
+ },
9061
+ {
9062
+ "epoch": 5.056929721240675,
9063
+ "grad_norm": 0.4415569305419922,
9064
+ "learning_rate": 2.6208827164982186e-06,
9065
+ "loss": 0.0216,
9066
+ "step": 12880
9067
+ },
9068
+ {
9069
+ "epoch": 5.060855908912446,
9070
+ "grad_norm": 0.09013790637254715,
9071
+ "learning_rate": 2.6099760052352216e-06,
9072
+ "loss": 0.3773,
9073
+ "step": 12890
9074
+ },
9075
+ {
9076
+ "epoch": 5.064782096584216,
9077
+ "grad_norm": 2.492323875427246,
9078
+ "learning_rate": 2.599069293972224e-06,
9079
+ "loss": 0.3093,
9080
+ "step": 12900
9081
+ },
9082
+ {
9083
+ "epoch": 5.068708284255988,
9084
+ "grad_norm": 0.0014120221603661776,
9085
+ "learning_rate": 2.588162582709227e-06,
9086
+ "loss": 0.2295,
9087
+ "step": 12910
9088
+ },
9089
+ {
9090
+ "epoch": 5.072634471927758,
9091
+ "grad_norm": 0.6903073191642761,
9092
+ "learning_rate": 2.5772558714462297e-06,
9093
+ "loss": 0.0299,
9094
+ "step": 12920
9095
+ },
9096
+ {
9097
+ "epoch": 5.0765606595995285,
9098
+ "grad_norm": 0.5149281024932861,
9099
+ "learning_rate": 2.566349160183233e-06,
9100
+ "loss": 0.0038,
9101
+ "step": 12930
9102
+ },
9103
+ {
9104
+ "epoch": 5.0804868472713,
9105
+ "grad_norm": 0.1371137499809265,
9106
+ "learning_rate": 2.5554424489202356e-06,
9107
+ "loss": 0.2088,
9108
+ "step": 12940
9109
+ },
9110
+ {
9111
+ "epoch": 5.08441303494307,
9112
+ "grad_norm": 70.09143829345703,
9113
+ "learning_rate": 2.5445357376572386e-06,
9114
+ "loss": 0.0551,
9115
+ "step": 12950
9116
+ },
9117
+ {
9118
+ "epoch": 5.088339222614841,
9119
+ "grad_norm": 0.018704677000641823,
9120
+ "learning_rate": 2.533629026394241e-06,
9121
+ "loss": 0.3212,
9122
+ "step": 12960
9123
+ },
9124
+ {
9125
+ "epoch": 5.092265410286612,
9126
+ "grad_norm": 5.903232574462891,
9127
+ "learning_rate": 2.522722315131244e-06,
9128
+ "loss": 0.2351,
9129
+ "step": 12970
9130
+ },
9131
+ {
9132
+ "epoch": 5.0961915979583825,
9133
+ "grad_norm": 0.45784613490104675,
9134
+ "learning_rate": 2.511815603868247e-06,
9135
+ "loss": 0.1325,
9136
+ "step": 12980
9137
+ },
9138
+ {
9139
+ "epoch": 5.100117785630153,
9140
+ "grad_norm": 0.3939746916294098,
9141
+ "learning_rate": 2.5009088926052496e-06,
9142
+ "loss": 0.0646,
9143
+ "step": 12990
9144
+ },
9145
+ {
9146
+ "epoch": 5.104043973301923,
9147
+ "grad_norm": 0.36335885524749756,
9148
+ "learning_rate": 2.490002181342253e-06,
9149
+ "loss": 0.2026,
9150
+ "step": 13000
9151
+ },
9152
+ {
9153
+ "epoch": 5.107970160973695,
9154
+ "grad_norm": 4.986391544342041,
9155
+ "learning_rate": 2.4790954700792556e-06,
9156
+ "loss": 0.1158,
9157
+ "step": 13010
9158
+ },
9159
+ {
9160
+ "epoch": 5.111896348645465,
9161
+ "grad_norm": 0.18488813936710358,
9162
+ "learning_rate": 2.4681887588162586e-06,
9163
+ "loss": 0.0048,
9164
+ "step": 13020
9165
+ },
9166
+ {
9167
+ "epoch": 5.115822536317236,
9168
+ "grad_norm": 3.7534120082855225,
9169
+ "learning_rate": 2.457282047553261e-06,
9170
+ "loss": 0.0204,
9171
+ "step": 13030
9172
+ },
9173
+ {
9174
+ "epoch": 5.119748723989007,
9175
+ "grad_norm": 0.1561623513698578,
9176
+ "learning_rate": 2.446375336290264e-06,
9177
+ "loss": 0.0287,
9178
+ "step": 13040
9179
+ },
9180
+ {
9181
+ "epoch": 5.123674911660777,
9182
+ "grad_norm": 0.003959027584642172,
9183
+ "learning_rate": 2.4354686250272667e-06,
9184
+ "loss": 0.1339,
9185
+ "step": 13050
9186
+ },
9187
+ {
9188
+ "epoch": 5.127601099332548,
9189
+ "grad_norm": 2.6318511962890625,
9190
+ "learning_rate": 2.4245619137642696e-06,
9191
+ "loss": 0.1513,
9192
+ "step": 13060
9193
+ },
9194
+ {
9195
+ "epoch": 5.131527287004319,
9196
+ "grad_norm": 23.85380744934082,
9197
+ "learning_rate": 2.413655202501272e-06,
9198
+ "loss": 0.0467,
9199
+ "step": 13070
9200
+ },
9201
+ {
9202
+ "epoch": 5.13545347467609,
9203
+ "grad_norm": 2.4958913326263428,
9204
+ "learning_rate": 2.4027484912382756e-06,
9205
+ "loss": 0.1685,
9206
+ "step": 13080
9207
+ },
9208
+ {
9209
+ "epoch": 5.13937966234786,
9210
+ "grad_norm": 4.380481719970703,
9211
+ "learning_rate": 2.391841779975278e-06,
9212
+ "loss": 0.0426,
9213
+ "step": 13090
9214
+ },
9215
+ {
9216
+ "epoch": 5.143305850019631,
9217
+ "grad_norm": 0.0490720272064209,
9218
+ "learning_rate": 2.380935068712281e-06,
9219
+ "loss": 0.1564,
9220
+ "step": 13100
9221
+ },
9222
+ {
9223
+ "epoch": 5.147232037691402,
9224
+ "grad_norm": 1.1204633712768555,
9225
+ "learning_rate": 2.3700283574492837e-06,
9226
+ "loss": 0.1093,
9227
+ "step": 13110
9228
+ },
9229
+ {
9230
+ "epoch": 5.151158225363172,
9231
+ "grad_norm": 0.0012298759538680315,
9232
+ "learning_rate": 2.3591216461862866e-06,
9233
+ "loss": 0.6134,
9234
+ "step": 13120
9235
+ },
9236
+ {
9237
+ "epoch": 5.155084413034943,
9238
+ "grad_norm": 0.34278765320777893,
9239
+ "learning_rate": 2.348214934923289e-06,
9240
+ "loss": 0.1132,
9241
+ "step": 13130
9242
+ },
9243
+ {
9244
+ "epoch": 5.159010600706714,
9245
+ "grad_norm": 0.8987643718719482,
9246
+ "learning_rate": 2.337308223660292e-06,
9247
+ "loss": 0.0614,
9248
+ "step": 13140
9249
+ },
9250
+ {
9251
+ "epoch": 5.1629367883784845,
9252
+ "grad_norm": 236.4158935546875,
9253
+ "learning_rate": 2.3264015123972956e-06,
9254
+ "loss": 0.2959,
9255
+ "step": 13150
9256
+ },
9257
+ {
9258
+ "epoch": 5.166862976050255,
9259
+ "grad_norm": 0.5242019891738892,
9260
+ "learning_rate": 2.315494801134298e-06,
9261
+ "loss": 0.1277,
9262
+ "step": 13160
9263
+ },
9264
+ {
9265
+ "epoch": 5.170789163722026,
9266
+ "grad_norm": 3.4700469970703125,
9267
+ "learning_rate": 2.304588089871301e-06,
9268
+ "loss": 0.2921,
9269
+ "step": 13170
9270
+ },
9271
+ {
9272
+ "epoch": 5.174715351393797,
9273
+ "grad_norm": 0.07389583438634872,
9274
+ "learning_rate": 2.2936813786083037e-06,
9275
+ "loss": 0.1182,
9276
+ "step": 13180
9277
+ },
9278
+ {
9279
+ "epoch": 5.178641539065567,
9280
+ "grad_norm": 0.9954978823661804,
9281
+ "learning_rate": 2.2827746673453066e-06,
9282
+ "loss": 0.1724,
9283
+ "step": 13190
9284
+ },
9285
+ {
9286
+ "epoch": 5.1825677267373385,
9287
+ "grad_norm": 1.2880812883377075,
9288
+ "learning_rate": 2.271867956082309e-06,
9289
+ "loss": 0.1925,
9290
+ "step": 13200
9291
+ },
9292
+ {
9293
+ "epoch": 5.186493914409109,
9294
+ "grad_norm": 0.2947271764278412,
9295
+ "learning_rate": 2.260961244819312e-06,
9296
+ "loss": 0.0891,
9297
+ "step": 13210
9298
+ },
9299
+ {
9300
+ "epoch": 5.190420102080879,
9301
+ "grad_norm": 1.8062124252319336,
9302
+ "learning_rate": 2.250054533556315e-06,
9303
+ "loss": 0.1711,
9304
+ "step": 13220
9305
+ },
9306
+ {
9307
+ "epoch": 5.19434628975265,
9308
+ "grad_norm": 1.5765048265457153,
9309
+ "learning_rate": 2.239147822293318e-06,
9310
+ "loss": 0.2276,
9311
+ "step": 13230
9312
+ },
9313
+ {
9314
+ "epoch": 5.198272477424421,
9315
+ "grad_norm": 36.99755859375,
9316
+ "learning_rate": 2.2282411110303207e-06,
9317
+ "loss": 0.0268,
9318
+ "step": 13240
9319
+ },
9320
+ {
9321
+ "epoch": 5.2021986650961916,
9322
+ "grad_norm": 0.04058028385043144,
9323
+ "learning_rate": 2.2173343997673236e-06,
9324
+ "loss": 0.0637,
9325
+ "step": 13250
9326
+ },
9327
+ {
9328
+ "epoch": 5.206124852767962,
9329
+ "grad_norm": 0.8288754820823669,
9330
+ "learning_rate": 2.206427688504326e-06,
9331
+ "loss": 0.3138,
9332
+ "step": 13260
9333
+ },
9334
+ {
9335
+ "epoch": 5.210051040439733,
9336
+ "grad_norm": 19.756397247314453,
9337
+ "learning_rate": 2.195520977241329e-06,
9338
+ "loss": 0.0867,
9339
+ "step": 13270
9340
+ },
9341
+ {
9342
+ "epoch": 5.213977228111504,
9343
+ "grad_norm": 0.07695566117763519,
9344
+ "learning_rate": 2.1846142659783317e-06,
9345
+ "loss": 0.021,
9346
+ "step": 13280
9347
+ },
9348
+ {
9349
+ "epoch": 5.217903415783274,
9350
+ "grad_norm": 0.0034018950536847115,
9351
+ "learning_rate": 2.1737075547153347e-06,
9352
+ "loss": 0.0148,
9353
+ "step": 13290
9354
+ },
9355
+ {
9356
+ "epoch": 5.2218296034550455,
9357
+ "grad_norm": 0.2699490487575531,
9358
+ "learning_rate": 2.162800843452338e-06,
9359
+ "loss": 0.1326,
9360
+ "step": 13300
9361
+ },
9362
+ {
9363
+ "epoch": 5.225755791126816,
9364
+ "grad_norm": 0.3341562747955322,
9365
+ "learning_rate": 2.1518941321893407e-06,
9366
+ "loss": 0.0725,
9367
+ "step": 13310
9368
+ },
9369
+ {
9370
+ "epoch": 5.229681978798586,
9371
+ "grad_norm": 1.3519041538238525,
9372
+ "learning_rate": 2.1409874209263436e-06,
9373
+ "loss": 0.4665,
9374
+ "step": 13320
9375
+ },
9376
+ {
9377
+ "epoch": 5.233608166470357,
9378
+ "grad_norm": 1.233429193496704,
9379
+ "learning_rate": 2.130080709663346e-06,
9380
+ "loss": 0.0432,
9381
+ "step": 13330
9382
+ },
9383
+ {
9384
+ "epoch": 5.237534354142128,
9385
+ "grad_norm": 0.17115157842636108,
9386
+ "learning_rate": 2.119173998400349e-06,
9387
+ "loss": 0.0434,
9388
+ "step": 13340
9389
+ },
9390
+ {
9391
+ "epoch": 5.241460541813899,
9392
+ "grad_norm": 0.15809869766235352,
9393
+ "learning_rate": 2.1082672871373517e-06,
9394
+ "loss": 0.0168,
9395
+ "step": 13350
9396
+ },
9397
+ {
9398
+ "epoch": 5.245386729485669,
9399
+ "grad_norm": 0.15708252787590027,
9400
+ "learning_rate": 2.0973605758743547e-06,
9401
+ "loss": 0.1109,
9402
+ "step": 13360
9403
+ },
9404
+ {
9405
+ "epoch": 5.24931291715744,
9406
+ "grad_norm": 0.01780679263174534,
9407
+ "learning_rate": 2.0864538646113577e-06,
9408
+ "loss": 0.2882,
9409
+ "step": 13370
9410
+ },
9411
+ {
9412
+ "epoch": 5.253239104829211,
9413
+ "grad_norm": 0.4691922068595886,
9414
+ "learning_rate": 2.0755471533483606e-06,
9415
+ "loss": 0.0458,
9416
+ "step": 13380
9417
+ },
9418
+ {
9419
+ "epoch": 5.257165292500981,
9420
+ "grad_norm": 3.2137491703033447,
9421
+ "learning_rate": 2.064640442085363e-06,
9422
+ "loss": 0.075,
9423
+ "step": 13390
9424
+ },
9425
+ {
9426
+ "epoch": 5.261091480172753,
9427
+ "grad_norm": 0.05457557737827301,
9428
+ "learning_rate": 2.053733730822366e-06,
9429
+ "loss": 0.0244,
9430
+ "step": 13400
9431
+ },
9432
+ {
9433
+ "epoch": 5.265017667844523,
9434
+ "grad_norm": 3.349138021469116,
9435
+ "learning_rate": 2.0428270195593687e-06,
9436
+ "loss": 0.0189,
9437
+ "step": 13410
9438
+ },
9439
+ {
9440
+ "epoch": 5.2689438555162935,
9441
+ "grad_norm": 0.4423637092113495,
9442
+ "learning_rate": 2.0319203082963717e-06,
9443
+ "loss": 0.0515,
9444
+ "step": 13420
9445
+ },
9446
+ {
9447
+ "epoch": 5.272870043188064,
9448
+ "grad_norm": 0.2601698637008667,
9449
+ "learning_rate": 2.0210135970333743e-06,
9450
+ "loss": 0.0444,
9451
+ "step": 13430
9452
+ },
9453
+ {
9454
+ "epoch": 5.276796230859835,
9455
+ "grad_norm": 2.402855396270752,
9456
+ "learning_rate": 2.0101068857703777e-06,
9457
+ "loss": 0.2045,
9458
+ "step": 13440
9459
+ },
9460
+ {
9461
+ "epoch": 5.280722418531606,
9462
+ "grad_norm": 3.287405490875244,
9463
+ "learning_rate": 1.99920017450738e-06,
9464
+ "loss": 0.0396,
9465
+ "step": 13450
9466
+ },
9467
+ {
9468
+ "epoch": 5.284648606203376,
9469
+ "grad_norm": 1.1102228164672852,
9470
+ "learning_rate": 1.988293463244383e-06,
9471
+ "loss": 0.3582,
9472
+ "step": 13460
9473
+ },
9474
+ {
9475
+ "epoch": 5.2885747938751475,
9476
+ "grad_norm": 0.3136877119541168,
9477
+ "learning_rate": 1.977386751981386e-06,
9478
+ "loss": 0.1289,
9479
+ "step": 13470
9480
+ },
9481
+ {
9482
+ "epoch": 5.292500981546918,
9483
+ "grad_norm": 0.09412948787212372,
9484
+ "learning_rate": 1.9664800407183887e-06,
9485
+ "loss": 0.0135,
9486
+ "step": 13480
9487
+ },
9488
+ {
9489
+ "epoch": 5.296427169218688,
9490
+ "grad_norm": 70.45732116699219,
9491
+ "learning_rate": 1.9555733294553917e-06,
9492
+ "loss": 0.437,
9493
+ "step": 13490
9494
+ },
9495
+ {
9496
+ "epoch": 5.30035335689046,
9497
+ "grad_norm": 13.51106071472168,
9498
+ "learning_rate": 1.9446666181923942e-06,
9499
+ "loss": 0.1514,
9500
+ "step": 13500
9501
+ },
9502
+ {
9503
+ "epoch": 5.30427954456223,
9504
+ "grad_norm": 0.05329679697751999,
9505
+ "learning_rate": 1.9337599069293972e-06,
9506
+ "loss": 0.2202,
9507
+ "step": 13510
9508
+ },
9509
+ {
9510
+ "epoch": 5.308205732234001,
9511
+ "grad_norm": 0.18707284331321716,
9512
+ "learning_rate": 1.9228531956664e-06,
9513
+ "loss": 0.0637,
9514
+ "step": 13520
9515
+ },
9516
+ {
9517
+ "epoch": 5.312131919905772,
9518
+ "grad_norm": 22.817306518554688,
9519
+ "learning_rate": 1.911946484403403e-06,
9520
+ "loss": 0.217,
9521
+ "step": 13530
9522
+ },
9523
+ {
9524
+ "epoch": 5.316058107577542,
9525
+ "grad_norm": 0.5910695791244507,
9526
+ "learning_rate": 1.901039773140406e-06,
9527
+ "loss": 0.1745,
9528
+ "step": 13540
9529
+ },
9530
+ {
9531
+ "epoch": 5.319984295249313,
9532
+ "grad_norm": 0.7026278972625732,
9533
+ "learning_rate": 1.8901330618774087e-06,
9534
+ "loss": 0.0093,
9535
+ "step": 13550
9536
+ },
9537
+ {
9538
+ "epoch": 5.323910482921083,
9539
+ "grad_norm": 1.2804770469665527,
9540
+ "learning_rate": 1.8792263506144115e-06,
9541
+ "loss": 0.1142,
9542
+ "step": 13560
9543
+ },
9544
+ {
9545
+ "epoch": 5.327836670592855,
9546
+ "grad_norm": 0.17652656137943268,
9547
+ "learning_rate": 1.8683196393514144e-06,
9548
+ "loss": 0.1111,
9549
+ "step": 13570
9550
+ },
9551
+ {
9552
+ "epoch": 5.331762858264625,
9553
+ "grad_norm": 31.41547393798828,
9554
+ "learning_rate": 1.8574129280884172e-06,
9555
+ "loss": 0.2906,
9556
+ "step": 13580
9557
+ },
9558
+ {
9559
+ "epoch": 5.3356890459363955,
9560
+ "grad_norm": 0.23575662076473236,
9561
+ "learning_rate": 1.84650621682542e-06,
9562
+ "loss": 0.2445,
9563
+ "step": 13590
9564
+ },
9565
+ {
9566
+ "epoch": 5.339615233608167,
9567
+ "grad_norm": 0.32999148964881897,
9568
+ "learning_rate": 1.8355995055624227e-06,
9569
+ "loss": 0.1179,
9570
+ "step": 13600
9571
+ },
9572
+ {
9573
+ "epoch": 5.343541421279937,
9574
+ "grad_norm": 0.01785041019320488,
9575
+ "learning_rate": 1.8246927942994257e-06,
9576
+ "loss": 0.0075,
9577
+ "step": 13610
9578
+ },
9579
+ {
9580
+ "epoch": 5.347467608951708,
9581
+ "grad_norm": 2.5955681800842285,
9582
+ "learning_rate": 1.8137860830364285e-06,
9583
+ "loss": 0.1413,
9584
+ "step": 13620
9585
+ },
9586
+ {
9587
+ "epoch": 5.351393796623478,
9588
+ "grad_norm": 29.785816192626953,
9589
+ "learning_rate": 1.8028793717734312e-06,
9590
+ "loss": 0.0787,
9591
+ "step": 13630
9592
+ },
9593
+ {
9594
+ "epoch": 5.3553199842952495,
9595
+ "grad_norm": 0.041280996054410934,
9596
+ "learning_rate": 1.791972660510434e-06,
9597
+ "loss": 0.0299,
9598
+ "step": 13640
9599
+ },
9600
+ {
9601
+ "epoch": 5.35924617196702,
9602
+ "grad_norm": 2.446070432662964,
9603
+ "learning_rate": 1.781065949247437e-06,
9604
+ "loss": 0.2202,
9605
+ "step": 13650
9606
+ },
9607
+ {
9608
+ "epoch": 5.36317235963879,
9609
+ "grad_norm": 0.14916792511940002,
9610
+ "learning_rate": 1.7701592379844397e-06,
9611
+ "loss": 0.0731,
9612
+ "step": 13660
9613
+ },
9614
+ {
9615
+ "epoch": 5.367098547310562,
9616
+ "grad_norm": 78.42562866210938,
9617
+ "learning_rate": 1.7592525267214425e-06,
9618
+ "loss": 0.2163,
9619
+ "step": 13670
9620
+ },
9621
+ {
9622
+ "epoch": 5.371024734982332,
9623
+ "grad_norm": 69.78839874267578,
9624
+ "learning_rate": 1.7483458154584455e-06,
9625
+ "loss": 0.2863,
9626
+ "step": 13680
9627
+ },
9628
+ {
9629
+ "epoch": 5.374950922654103,
9630
+ "grad_norm": 0.006996185053139925,
9631
+ "learning_rate": 1.7374391041954483e-06,
9632
+ "loss": 0.0782,
9633
+ "step": 13690
9634
+ },
9635
+ {
9636
+ "epoch": 5.378877110325874,
9637
+ "grad_norm": 113.34099578857422,
9638
+ "learning_rate": 1.7265323929324512e-06,
9639
+ "loss": 0.2028,
9640
+ "step": 13700
9641
+ },
9642
+ {
9643
+ "epoch": 5.382803297997644,
9644
+ "grad_norm": 12.9867582321167,
9645
+ "learning_rate": 1.715625681669454e-06,
9646
+ "loss": 0.2298,
9647
+ "step": 13710
9648
+ },
9649
+ {
9650
+ "epoch": 5.386729485669415,
9651
+ "grad_norm": 1.6531518697738647,
9652
+ "learning_rate": 1.704718970406457e-06,
9653
+ "loss": 0.0999,
9654
+ "step": 13720
9655
+ },
9656
+ {
9657
+ "epoch": 5.390655673341186,
9658
+ "grad_norm": 18.483884811401367,
9659
+ "learning_rate": 1.6938122591434597e-06,
9660
+ "loss": 0.0566,
9661
+ "step": 13730
9662
+ },
9663
+ {
9664
+ "epoch": 5.394581861012957,
9665
+ "grad_norm": 0.0013142261886969209,
9666
+ "learning_rate": 1.6829055478804625e-06,
9667
+ "loss": 0.0257,
9668
+ "step": 13740
9669
+ },
9670
+ {
9671
+ "epoch": 5.398508048684727,
9672
+ "grad_norm": 1.1040148735046387,
9673
+ "learning_rate": 1.6719988366174653e-06,
9674
+ "loss": 0.1075,
9675
+ "step": 13750
9676
+ },
9677
+ {
9678
+ "epoch": 5.402434236356497,
9679
+ "grad_norm": 0.766169011592865,
9680
+ "learning_rate": 1.6610921253544682e-06,
9681
+ "loss": 0.0146,
9682
+ "step": 13760
9683
+ },
9684
+ {
9685
+ "epoch": 5.406360424028269,
9686
+ "grad_norm": 0.00455129100009799,
9687
+ "learning_rate": 1.650185414091471e-06,
9688
+ "loss": 0.191,
9689
+ "step": 13770
9690
+ },
9691
+ {
9692
+ "epoch": 5.410286611700039,
9693
+ "grad_norm": 0.04159601777791977,
9694
+ "learning_rate": 1.6392787028284738e-06,
9695
+ "loss": 0.081,
9696
+ "step": 13780
9697
+ },
9698
+ {
9699
+ "epoch": 5.41421279937181,
9700
+ "grad_norm": 0.2454538196325302,
9701
+ "learning_rate": 1.6283719915654767e-06,
9702
+ "loss": 0.1503,
9703
+ "step": 13790
9704
+ },
9705
+ {
9706
+ "epoch": 5.418138987043581,
9707
+ "grad_norm": 3.542814016342163,
9708
+ "learning_rate": 1.6174652803024795e-06,
9709
+ "loss": 0.0179,
9710
+ "step": 13800
9711
+ },
9712
+ {
9713
+ "epoch": 5.422065174715351,
9714
+ "grad_norm": 0.015254409052431583,
9715
+ "learning_rate": 1.6065585690394823e-06,
9716
+ "loss": 0.0264,
9717
+ "step": 13810
9718
+ },
9719
+ {
9720
+ "epoch": 5.425991362387122,
9721
+ "grad_norm": 1.0630416870117188,
9722
+ "learning_rate": 1.595651857776485e-06,
9723
+ "loss": 0.212,
9724
+ "step": 13820
9725
+ },
9726
+ {
9727
+ "epoch": 5.429917550058893,
9728
+ "grad_norm": 0.8215121030807495,
9729
+ "learning_rate": 1.584745146513488e-06,
9730
+ "loss": 0.0404,
9731
+ "step": 13830
9732
+ },
9733
+ {
9734
+ "epoch": 5.433843737730664,
9735
+ "grad_norm": 2.257424831390381,
9736
+ "learning_rate": 1.5738384352504908e-06,
9737
+ "loss": 0.1186,
9738
+ "step": 13840
9739
+ },
9740
+ {
9741
+ "epoch": 5.437769925402434,
9742
+ "grad_norm": 0.004584351554512978,
9743
+ "learning_rate": 1.5629317239874935e-06,
9744
+ "loss": 0.3232,
9745
+ "step": 13850
9746
+ },
9747
+ {
9748
+ "epoch": 5.4416961130742045,
9749
+ "grad_norm": 0.18819616734981537,
9750
+ "learning_rate": 1.5520250127244965e-06,
9751
+ "loss": 0.0491,
9752
+ "step": 13860
9753
+ },
9754
+ {
9755
+ "epoch": 5.445622300745976,
9756
+ "grad_norm": 52.796051025390625,
9757
+ "learning_rate": 1.5411183014614995e-06,
9758
+ "loss": 0.1477,
9759
+ "step": 13870
9760
+ },
9761
+ {
9762
+ "epoch": 5.449548488417746,
9763
+ "grad_norm": 0.038888316601514816,
9764
+ "learning_rate": 1.5302115901985023e-06,
9765
+ "loss": 0.1332,
9766
+ "step": 13880
9767
+ },
9768
+ {
9769
+ "epoch": 5.453474676089517,
9770
+ "grad_norm": 25.012880325317383,
9771
+ "learning_rate": 1.519304878935505e-06,
9772
+ "loss": 0.0229,
9773
+ "step": 13890
9774
+ },
9775
+ {
9776
+ "epoch": 5.457400863761288,
9777
+ "grad_norm": 50.476722717285156,
9778
+ "learning_rate": 1.5083981676725078e-06,
9779
+ "loss": 0.2821,
9780
+ "step": 13900
9781
+ },
9782
+ {
9783
+ "epoch": 5.4613270514330585,
9784
+ "grad_norm": 0.554529070854187,
9785
+ "learning_rate": 1.4974914564095108e-06,
9786
+ "loss": 0.1492,
9787
+ "step": 13910
9788
+ },
9789
+ {
9790
+ "epoch": 5.465253239104829,
9791
+ "grad_norm": 39.14360046386719,
9792
+ "learning_rate": 1.4865847451465135e-06,
9793
+ "loss": 0.0737,
9794
+ "step": 13920
9795
+ },
9796
+ {
9797
+ "epoch": 5.4691794267766,
9798
+ "grad_norm": 27.91413688659668,
9799
+ "learning_rate": 1.4756780338835163e-06,
9800
+ "loss": 0.1312,
9801
+ "step": 13930
9802
+ },
9803
+ {
9804
+ "epoch": 5.473105614448371,
9805
+ "grad_norm": 4.997786521911621,
9806
+ "learning_rate": 1.4647713226205193e-06,
9807
+ "loss": 0.1063,
9808
+ "step": 13940
9809
+ },
9810
+ {
9811
+ "epoch": 5.477031802120141,
9812
+ "grad_norm": 3.62794828414917,
9813
+ "learning_rate": 1.453864611357522e-06,
9814
+ "loss": 0.1987,
9815
+ "step": 13950
9816
+ },
9817
+ {
9818
+ "epoch": 5.4809579897919125,
9819
+ "grad_norm": 0.12887300550937653,
9820
+ "learning_rate": 1.4429579000945248e-06,
9821
+ "loss": 0.0639,
9822
+ "step": 13960
9823
+ },
9824
+ {
9825
+ "epoch": 5.484884177463683,
9826
+ "grad_norm": 34.32918930053711,
9827
+ "learning_rate": 1.4320511888315276e-06,
9828
+ "loss": 0.1747,
9829
+ "step": 13970
9830
+ },
9831
+ {
9832
+ "epoch": 5.488810365135453,
9833
+ "grad_norm": 50.748191833496094,
9834
+ "learning_rate": 1.4211444775685305e-06,
9835
+ "loss": 0.2842,
9836
+ "step": 13980
9837
+ },
9838
+ {
9839
+ "epoch": 5.492736552807224,
9840
+ "grad_norm": 0.0012191717978566885,
9841
+ "learning_rate": 1.4102377663055333e-06,
9842
+ "loss": 0.3211,
9843
+ "step": 13990
9844
+ },
9845
+ {
9846
+ "epoch": 5.496662740478995,
9847
+ "grad_norm": 0.11494173109531403,
9848
+ "learning_rate": 1.399331055042536e-06,
9849
+ "loss": 0.0201,
9850
+ "step": 14000
9851
+ },
9852
+ {
9853
+ "epoch": 5.500588928150766,
9854
+ "grad_norm": 0.022875521332025528,
9855
+ "learning_rate": 1.388424343779539e-06,
9856
+ "loss": 0.2013,
9857
+ "step": 14010
9858
+ },
9859
+ {
9860
+ "epoch": 5.504515115822536,
9861
+ "grad_norm": 0.012128263711929321,
9862
+ "learning_rate": 1.377517632516542e-06,
9863
+ "loss": 0.0205,
9864
+ "step": 14020
9865
+ },
9866
+ {
9867
+ "epoch": 5.508441303494307,
9868
+ "grad_norm": 0.1611277312040329,
9869
+ "learning_rate": 1.3666109212535448e-06,
9870
+ "loss": 0.03,
9871
+ "step": 14030
9872
+ },
9873
+ {
9874
+ "epoch": 5.512367491166078,
9875
+ "grad_norm": 2.261368751525879,
9876
+ "learning_rate": 1.3557042099905476e-06,
9877
+ "loss": 0.0399,
9878
+ "step": 14040
9879
+ },
9880
+ {
9881
+ "epoch": 5.516293678837848,
9882
+ "grad_norm": 33.47151184082031,
9883
+ "learning_rate": 1.3447974987275505e-06,
9884
+ "loss": 0.0367,
9885
+ "step": 14050
9886
+ },
9887
+ {
9888
+ "epoch": 5.520219866509619,
9889
+ "grad_norm": 0.10368947684764862,
9890
+ "learning_rate": 1.3338907874645533e-06,
9891
+ "loss": 0.0327,
9892
+ "step": 14060
9893
+ },
9894
+ {
9895
+ "epoch": 5.52414605418139,
9896
+ "grad_norm": 4.692790985107422,
9897
+ "learning_rate": 1.322984076201556e-06,
9898
+ "loss": 0.1582,
9899
+ "step": 14070
9900
+ },
9901
+ {
9902
+ "epoch": 5.5280722418531605,
9903
+ "grad_norm": 0.893932044506073,
9904
+ "learning_rate": 1.3120773649385588e-06,
9905
+ "loss": 0.0526,
9906
+ "step": 14080
9907
+ },
9908
+ {
9909
+ "epoch": 5.531998429524931,
9910
+ "grad_norm": 5.193930625915527,
9911
+ "learning_rate": 1.3011706536755618e-06,
9912
+ "loss": 0.0851,
9913
+ "step": 14090
9914
+ },
9915
+ {
9916
+ "epoch": 5.535924617196702,
9917
+ "grad_norm": 1.462369441986084,
9918
+ "learning_rate": 1.2902639424125646e-06,
9919
+ "loss": 0.1355,
9920
+ "step": 14100
9921
+ },
9922
+ {
9923
+ "epoch": 5.539850804868473,
9924
+ "grad_norm": 0.06339607387781143,
9925
+ "learning_rate": 1.2793572311495673e-06,
9926
+ "loss": 0.1839,
9927
+ "step": 14110
9928
+ },
9929
+ {
9930
+ "epoch": 5.543776992540243,
9931
+ "grad_norm": 0.01694520190358162,
9932
+ "learning_rate": 1.26845051988657e-06,
9933
+ "loss": 0.1235,
9934
+ "step": 14120
9935
+ },
9936
+ {
9937
+ "epoch": 5.5477031802120145,
9938
+ "grad_norm": 0.2765995264053345,
9939
+ "learning_rate": 1.257543808623573e-06,
9940
+ "loss": 0.3504,
9941
+ "step": 14130
9942
+ },
9943
+ {
9944
+ "epoch": 5.551629367883785,
9945
+ "grad_norm": 0.2754051685333252,
9946
+ "learning_rate": 1.2466370973605758e-06,
9947
+ "loss": 0.0768,
9948
+ "step": 14140
9949
+ },
9950
+ {
9951
+ "epoch": 5.555555555555555,
9952
+ "grad_norm": 0.051648326218128204,
9953
+ "learning_rate": 1.2357303860975786e-06,
9954
+ "loss": 0.0335,
9955
+ "step": 14150
9956
+ },
9957
+ {
9958
+ "epoch": 5.559481743227327,
9959
+ "grad_norm": 0.07156907021999359,
9960
+ "learning_rate": 1.2248236748345816e-06,
9961
+ "loss": 0.0572,
9962
+ "step": 14160
9963
+ },
9964
+ {
9965
+ "epoch": 5.563407930899097,
9966
+ "grad_norm": 0.6992261409759521,
9967
+ "learning_rate": 1.2139169635715843e-06,
9968
+ "loss": 0.1361,
9969
+ "step": 14170
9970
+ },
9971
+ {
9972
+ "epoch": 5.567334118570868,
9973
+ "grad_norm": 2.310169219970703,
9974
+ "learning_rate": 1.2030102523085873e-06,
9975
+ "loss": 0.0061,
9976
+ "step": 14180
9977
+ },
9978
+ {
9979
+ "epoch": 5.571260306242638,
9980
+ "grad_norm": 1.5157902240753174,
9981
+ "learning_rate": 1.19210354104559e-06,
9982
+ "loss": 0.0122,
9983
+ "step": 14190
9984
+ },
9985
+ {
9986
+ "epoch": 5.575186493914409,
9987
+ "grad_norm": 29.78800392150879,
9988
+ "learning_rate": 1.181196829782593e-06,
9989
+ "loss": 0.1933,
9990
+ "step": 14200
9991
+ },
9992
+ {
9993
+ "epoch": 5.57911268158618,
9994
+ "grad_norm": 75.67921447753906,
9995
+ "learning_rate": 1.1702901185195958e-06,
9996
+ "loss": 0.2681,
9997
+ "step": 14210
9998
+ },
9999
+ {
10000
+ "epoch": 5.58303886925795,
10001
+ "grad_norm": 0.10809934139251709,
10002
+ "learning_rate": 1.1593834072565986e-06,
10003
+ "loss": 0.1424,
10004
+ "step": 14220
10005
+ },
10006
+ {
10007
+ "epoch": 5.586965056929722,
10008
+ "grad_norm": 0.010167748667299747,
10009
+ "learning_rate": 1.1484766959936014e-06,
10010
+ "loss": 0.1019,
10011
+ "step": 14230
10012
+ },
10013
+ {
10014
+ "epoch": 5.590891244601492,
10015
+ "grad_norm": 0.03884586691856384,
10016
+ "learning_rate": 1.1375699847306043e-06,
10017
+ "loss": 0.1516,
10018
+ "step": 14240
10019
+ },
10020
+ {
10021
+ "epoch": 5.5948174322732624,
10022
+ "grad_norm": 0.25132858753204346,
10023
+ "learning_rate": 1.126663273467607e-06,
10024
+ "loss": 0.0585,
10025
+ "step": 14250
10026
+ },
10027
+ {
10028
+ "epoch": 5.598743619945033,
10029
+ "grad_norm": 11.373438835144043,
10030
+ "learning_rate": 1.1157565622046099e-06,
10031
+ "loss": 0.1883,
10032
+ "step": 14260
10033
+ },
10034
+ {
10035
+ "epoch": 5.602669807616804,
10036
+ "grad_norm": 24.09672737121582,
10037
+ "learning_rate": 1.1048498509416128e-06,
10038
+ "loss": 0.1442,
10039
+ "step": 14270
10040
+ },
10041
+ {
10042
+ "epoch": 5.606595995288575,
10043
+ "grad_norm": 8.562515258789062,
10044
+ "learning_rate": 1.0939431396786156e-06,
10045
+ "loss": 0.2073,
10046
+ "step": 14280
10047
+ },
10048
+ {
10049
+ "epoch": 5.610522182960345,
10050
+ "grad_norm": 6.606204509735107,
10051
+ "learning_rate": 1.0830364284156184e-06,
10052
+ "loss": 0.0234,
10053
+ "step": 14290
10054
+ },
10055
+ {
10056
+ "epoch": 5.614448370632116,
10057
+ "grad_norm": 0.007998577319085598,
10058
+ "learning_rate": 1.0721297171526211e-06,
10059
+ "loss": 0.1604,
10060
+ "step": 14300
10061
+ },
10062
+ {
10063
+ "epoch": 5.618374558303887,
10064
+ "grad_norm": 5.682195663452148,
10065
+ "learning_rate": 1.0612230058896241e-06,
10066
+ "loss": 0.0596,
10067
+ "step": 14310
10068
+ },
10069
+ {
10070
+ "epoch": 5.622300745975657,
10071
+ "grad_norm": 20.193941116333008,
10072
+ "learning_rate": 1.0503162946266269e-06,
10073
+ "loss": 0.0647,
10074
+ "step": 14320
10075
+ },
10076
+ {
10077
+ "epoch": 5.626226933647429,
10078
+ "grad_norm": 0.019956370815634727,
10079
+ "learning_rate": 1.0394095833636299e-06,
10080
+ "loss": 0.0364,
10081
+ "step": 14330
10082
+ },
10083
+ {
10084
+ "epoch": 5.630153121319199,
10085
+ "grad_norm": 2.3317253589630127,
10086
+ "learning_rate": 1.0285028721006326e-06,
10087
+ "loss": 0.3789,
10088
+ "step": 14340
10089
+ },
10090
+ {
10091
+ "epoch": 5.6340793089909695,
10092
+ "grad_norm": 0.5026030540466309,
10093
+ "learning_rate": 1.0175961608376356e-06,
10094
+ "loss": 0.025,
10095
+ "step": 14350
10096
+ },
10097
+ {
10098
+ "epoch": 5.638005496662741,
10099
+ "grad_norm": 6.219483375549316,
10100
+ "learning_rate": 1.0066894495746384e-06,
10101
+ "loss": 0.0894,
10102
+ "step": 14360
10103
+ },
10104
+ {
10105
+ "epoch": 5.641931684334511,
10106
+ "grad_norm": 1.2812358140945435,
10107
+ "learning_rate": 9.957827383116411e-07,
10108
+ "loss": 0.1408,
10109
+ "step": 14370
10110
+ },
10111
+ {
10112
+ "epoch": 5.645857872006282,
10113
+ "grad_norm": 40.860984802246094,
10114
+ "learning_rate": 9.84876027048644e-07,
10115
+ "loss": 0.2064,
10116
+ "step": 14380
10117
+ },
10118
+ {
10119
+ "epoch": 5.649784059678053,
10120
+ "grad_norm": 0.19107870757579803,
10121
+ "learning_rate": 9.739693157856469e-07,
10122
+ "loss": 0.0781,
10123
+ "step": 14390
10124
+ },
10125
+ {
10126
+ "epoch": 5.6537102473498235,
10127
+ "grad_norm": 1.1489980220794678,
10128
+ "learning_rate": 9.630626045226496e-07,
10129
+ "loss": 0.3186,
10130
+ "step": 14400
10131
+ },
10132
+ {
10133
+ "epoch": 5.657636435021594,
10134
+ "grad_norm": 33.495182037353516,
10135
+ "learning_rate": 9.521558932596524e-07,
10136
+ "loss": 0.0795,
10137
+ "step": 14410
10138
+ },
10139
+ {
10140
+ "epoch": 5.661562622693364,
10141
+ "grad_norm": 0.024215303361415863,
10142
+ "learning_rate": 9.412491819966554e-07,
10143
+ "loss": 0.0617,
10144
+ "step": 14420
10145
+ },
10146
+ {
10147
+ "epoch": 5.665488810365136,
10148
+ "grad_norm": 0.06826794147491455,
10149
+ "learning_rate": 9.303424707336581e-07,
10150
+ "loss": 0.1631,
10151
+ "step": 14430
10152
+ },
10153
+ {
10154
+ "epoch": 5.669414998036906,
10155
+ "grad_norm": 0.21508584916591644,
10156
+ "learning_rate": 9.194357594706609e-07,
10157
+ "loss": 0.0656,
10158
+ "step": 14440
10159
+ },
10160
+ {
10161
+ "epoch": 5.673341185708677,
10162
+ "grad_norm": 0.9320672154426575,
10163
+ "learning_rate": 9.085290482076639e-07,
10164
+ "loss": 0.0274,
10165
+ "step": 14450
10166
+ },
10167
+ {
10168
+ "epoch": 5.677267373380448,
10169
+ "grad_norm": 0.06282981485128403,
10170
+ "learning_rate": 8.976223369446666e-07,
10171
+ "loss": 0.009,
10172
+ "step": 14460
10173
+ },
10174
+ {
10175
+ "epoch": 5.681193561052218,
10176
+ "grad_norm": 0.796981155872345,
10177
+ "learning_rate": 8.867156256816695e-07,
10178
+ "loss": 0.0869,
10179
+ "step": 14470
10180
+ },
10181
+ {
10182
+ "epoch": 5.685119748723989,
10183
+ "grad_norm": 0.47704437375068665,
10184
+ "learning_rate": 8.758089144186723e-07,
10185
+ "loss": 0.0558,
10186
+ "step": 14480
10187
+ },
10188
+ {
10189
+ "epoch": 5.689045936395759,
10190
+ "grad_norm": 0.8715189099311829,
10191
+ "learning_rate": 8.649022031556751e-07,
10192
+ "loss": 0.3915,
10193
+ "step": 14490
10194
+ },
10195
+ {
10196
+ "epoch": 5.692972124067531,
10197
+ "grad_norm": 2.462773561477661,
10198
+ "learning_rate": 8.53995491892678e-07,
10199
+ "loss": 0.2456,
10200
+ "step": 14500
10201
+ },
10202
+ {
10203
+ "epoch": 5.696898311739301,
10204
+ "grad_norm": 0.7116551399230957,
10205
+ "learning_rate": 8.430887806296808e-07,
10206
+ "loss": 0.0251,
10207
+ "step": 14510
10208
+ },
10209
+ {
10210
+ "epoch": 5.7008244994110715,
10211
+ "grad_norm": 1.1840918064117432,
10212
+ "learning_rate": 8.321820693666837e-07,
10213
+ "loss": 0.1322,
10214
+ "step": 14520
10215
+ },
10216
+ {
10217
+ "epoch": 5.704750687082843,
10218
+ "grad_norm": 0.09072090685367584,
10219
+ "learning_rate": 8.212753581036865e-07,
10220
+ "loss": 0.1009,
10221
+ "step": 14530
10222
+ },
10223
+ {
10224
+ "epoch": 5.708676874754613,
10225
+ "grad_norm": 28.93404769897461,
10226
+ "learning_rate": 8.103686468406894e-07,
10227
+ "loss": 0.145,
10228
+ "step": 14540
10229
+ },
10230
+ {
10231
+ "epoch": 5.712603062426384,
10232
+ "grad_norm": 1.9189600944519043,
10233
+ "learning_rate": 7.994619355776922e-07,
10234
+ "loss": 0.0692,
10235
+ "step": 14550
10236
+ },
10237
+ {
10238
+ "epoch": 5.716529250098155,
10239
+ "grad_norm": 0.047376301139593124,
10240
+ "learning_rate": 7.88555224314695e-07,
10241
+ "loss": 0.0679,
10242
+ "step": 14560
10243
+ },
10244
+ {
10245
+ "epoch": 5.7204554377699255,
10246
+ "grad_norm": 0.20065166056156158,
10247
+ "learning_rate": 7.776485130516978e-07,
10248
+ "loss": 0.0101,
10249
+ "step": 14570
10250
+ },
10251
+ {
10252
+ "epoch": 5.724381625441696,
10253
+ "grad_norm": 0.5917192697525024,
10254
+ "learning_rate": 7.667418017887007e-07,
10255
+ "loss": 0.0334,
10256
+ "step": 14580
10257
+ },
10258
+ {
10259
+ "epoch": 5.728307813113467,
10260
+ "grad_norm": 75.4139175415039,
10261
+ "learning_rate": 7.558350905257034e-07,
10262
+ "loss": 0.0821,
10263
+ "step": 14590
10264
+ },
10265
+ {
10266
+ "epoch": 5.732234000785238,
10267
+ "grad_norm": 0.0929323062300682,
10268
+ "learning_rate": 7.449283792627063e-07,
10269
+ "loss": 0.053,
10270
+ "step": 14600
10271
+ },
10272
+ {
10273
+ "epoch": 5.736160188457008,
10274
+ "grad_norm": 0.023365721106529236,
10275
+ "learning_rate": 7.340216679997092e-07,
10276
+ "loss": 0.0522,
10277
+ "step": 14610
10278
+ },
10279
+ {
10280
+ "epoch": 5.740086376128779,
10281
+ "grad_norm": 1.7454125881195068,
10282
+ "learning_rate": 7.23114956736712e-07,
10283
+ "loss": 0.0152,
10284
+ "step": 14620
10285
+ },
10286
+ {
10287
+ "epoch": 5.74401256380055,
10288
+ "grad_norm": 48.125389099121094,
10289
+ "learning_rate": 7.122082454737149e-07,
10290
+ "loss": 0.3991,
10291
+ "step": 14630
10292
+ },
10293
+ {
10294
+ "epoch": 5.74793875147232,
10295
+ "grad_norm": 0.10581380128860474,
10296
+ "learning_rate": 7.013015342107177e-07,
10297
+ "loss": 0.0214,
10298
+ "step": 14640
10299
+ },
10300
+ {
10301
+ "epoch": 5.751864939144091,
10302
+ "grad_norm": 0.009025956504046917,
10303
+ "learning_rate": 6.903948229477205e-07,
10304
+ "loss": 0.0195,
10305
+ "step": 14650
10306
+ },
10307
+ {
10308
+ "epoch": 5.755791126815862,
10309
+ "grad_norm": 3.209620237350464,
10310
+ "learning_rate": 6.794881116847233e-07,
10311
+ "loss": 0.0433,
10312
+ "step": 14660
10313
+ },
10314
+ {
10315
+ "epoch": 5.759717314487633,
10316
+ "grad_norm": 0.2999258041381836,
10317
+ "learning_rate": 6.685814004217262e-07,
10318
+ "loss": 0.1324,
10319
+ "step": 14670
10320
+ },
10321
+ {
10322
+ "epoch": 5.763643502159403,
10323
+ "grad_norm": 52.33720779418945,
10324
+ "learning_rate": 6.576746891587289e-07,
10325
+ "loss": 0.2876,
10326
+ "step": 14680
10327
+ },
10328
+ {
10329
+ "epoch": 5.7675696898311735,
10330
+ "grad_norm": 38.36469650268555,
10331
+ "learning_rate": 6.467679778957319e-07,
10332
+ "loss": 0.3568,
10333
+ "step": 14690
10334
+ },
10335
+ {
10336
+ "epoch": 5.771495877502945,
10337
+ "grad_norm": 3.552603244781494,
10338
+ "learning_rate": 6.358612666327347e-07,
10339
+ "loss": 0.2049,
10340
+ "step": 14700
10341
+ },
10342
+ {
10343
+ "epoch": 5.775422065174715,
10344
+ "grad_norm": 66.29668426513672,
10345
+ "learning_rate": 6.249545553697376e-07,
10346
+ "loss": 0.2132,
10347
+ "step": 14710
10348
+ },
10349
+ {
10350
+ "epoch": 5.779348252846486,
10351
+ "grad_norm": 8.237017631530762,
10352
+ "learning_rate": 6.140478441067403e-07,
10353
+ "loss": 0.0869,
10354
+ "step": 14720
10355
+ },
10356
+ {
10357
+ "epoch": 5.783274440518257,
10358
+ "grad_norm": 1.2616140842437744,
10359
+ "learning_rate": 6.031411328437432e-07,
10360
+ "loss": 0.1083,
10361
+ "step": 14730
10362
+ },
10363
+ {
10364
+ "epoch": 5.7872006281900275,
10365
+ "grad_norm": 0.026556458324193954,
10366
+ "learning_rate": 5.922344215807461e-07,
10367
+ "loss": 0.0232,
10368
+ "step": 14740
10369
+ },
10370
+ {
10371
+ "epoch": 5.791126815861798,
10372
+ "grad_norm": 13.759248733520508,
10373
+ "learning_rate": 5.813277103177488e-07,
10374
+ "loss": 0.0617,
10375
+ "step": 14750
10376
+ },
10377
+ {
10378
+ "epoch": 5.795053003533569,
10379
+ "grad_norm": 0.7599974274635315,
10380
+ "learning_rate": 5.704209990547517e-07,
10381
+ "loss": 0.0713,
10382
+ "step": 14760
10383
+ },
10384
+ {
10385
+ "epoch": 5.79897919120534,
10386
+ "grad_norm": 1.4931432008743286,
10387
+ "learning_rate": 5.595142877917546e-07,
10388
+ "loss": 0.0436,
10389
+ "step": 14770
10390
+ },
10391
+ {
10392
+ "epoch": 5.80290537887711,
10393
+ "grad_norm": 1.4563673734664917,
10394
+ "learning_rate": 5.486075765287574e-07,
10395
+ "loss": 0.0414,
10396
+ "step": 14780
10397
+ },
10398
+ {
10399
+ "epoch": 5.806831566548881,
10400
+ "grad_norm": 0.5742218494415283,
10401
+ "learning_rate": 5.377008652657602e-07,
10402
+ "loss": 0.0353,
10403
+ "step": 14790
10404
+ },
10405
+ {
10406
+ "epoch": 5.810757754220652,
10407
+ "grad_norm": 0.47356608510017395,
10408
+ "learning_rate": 5.267941540027631e-07,
10409
+ "loss": 0.0474,
10410
+ "step": 14800
10411
+ },
10412
+ {
10413
+ "epoch": 5.814683941892422,
10414
+ "grad_norm": 42.07039260864258,
10415
+ "learning_rate": 5.158874427397658e-07,
10416
+ "loss": 0.205,
10417
+ "step": 14810
10418
+ },
10419
+ {
10420
+ "epoch": 5.818610129564194,
10421
+ "grad_norm": 0.021612640470266342,
10422
+ "learning_rate": 5.049807314767687e-07,
10423
+ "loss": 0.2927,
10424
+ "step": 14820
10425
+ },
10426
+ {
10427
+ "epoch": 5.822536317235964,
10428
+ "grad_norm": 15.521261215209961,
10429
+ "learning_rate": 4.940740202137715e-07,
10430
+ "loss": 0.1204,
10431
+ "step": 14830
10432
+ },
10433
+ {
10434
+ "epoch": 5.8264625049077345,
10435
+ "grad_norm": 1.3707491159439087,
10436
+ "learning_rate": 4.831673089507743e-07,
10437
+ "loss": 0.0651,
10438
+ "step": 14840
10439
+ },
10440
+ {
10441
+ "epoch": 5.830388692579505,
10442
+ "grad_norm": 1.2329983711242676,
10443
+ "learning_rate": 4.7226059768777727e-07,
10444
+ "loss": 0.1242,
10445
+ "step": 14850
10446
+ },
10447
+ {
10448
+ "epoch": 5.834314880251276,
10449
+ "grad_norm": 39.72314453125,
10450
+ "learning_rate": 4.6135388642478003e-07,
10451
+ "loss": 0.3012,
10452
+ "step": 14860
10453
+ },
10454
+ {
10455
+ "epoch": 5.838241067923047,
10456
+ "grad_norm": 14.552694320678711,
10457
+ "learning_rate": 4.504471751617829e-07,
10458
+ "loss": 0.1662,
10459
+ "step": 14870
10460
+ },
10461
+ {
10462
+ "epoch": 5.842167255594817,
10463
+ "grad_norm": 0.45098528265953064,
10464
+ "learning_rate": 4.395404638987857e-07,
10465
+ "loss": 0.0267,
10466
+ "step": 14880
10467
+ },
10468
+ {
10469
+ "epoch": 5.8460934432665885,
10470
+ "grad_norm": 12.383710861206055,
10471
+ "learning_rate": 4.2863375263578854e-07,
10472
+ "loss": 0.0692,
10473
+ "step": 14890
10474
+ },
10475
+ {
10476
+ "epoch": 5.850019630938359,
10477
+ "grad_norm": 0.05478942394256592,
10478
+ "learning_rate": 4.1772704137279136e-07,
10479
+ "loss": 0.0624,
10480
+ "step": 14900
10481
+ },
10482
+ {
10483
+ "epoch": 5.853945818610129,
10484
+ "grad_norm": 0.025708330795168877,
10485
+ "learning_rate": 4.068203301097943e-07,
10486
+ "loss": 0.239,
10487
+ "step": 14910
10488
+ },
10489
+ {
10490
+ "epoch": 5.8578720062819,
10491
+ "grad_norm": 0.9735015630722046,
10492
+ "learning_rate": 3.959136188467971e-07,
10493
+ "loss": 0.0631,
10494
+ "step": 14920
10495
+ },
10496
+ {
10497
+ "epoch": 5.861798193953671,
10498
+ "grad_norm": 1.240945816040039,
10499
+ "learning_rate": 3.850069075837999e-07,
10500
+ "loss": 0.046,
10501
+ "step": 14930
10502
+ },
10503
+ {
10504
+ "epoch": 5.865724381625442,
10505
+ "grad_norm": 0.002003891160711646,
10506
+ "learning_rate": 3.7410019632080273e-07,
10507
+ "loss": 0.0147,
10508
+ "step": 14940
10509
+ },
10510
+ {
10511
+ "epoch": 5.869650569297212,
10512
+ "grad_norm": Infinity,
10513
+ "learning_rate": 3.631934850578056e-07,
10514
+ "loss": 0.6244,
10515
+ "step": 14950
10516
+ },
10517
+ {
10518
+ "epoch": 5.873576756968983,
10519
+ "grad_norm": 0.04449770227074623,
10520
+ "learning_rate": 3.522867737948084e-07,
10521
+ "loss": 0.0087,
10522
+ "step": 14960
10523
+ },
10524
+ {
10525
+ "epoch": 5.877502944640754,
10526
+ "grad_norm": 0.5825850367546082,
10527
+ "learning_rate": 3.4138006253181124e-07,
10528
+ "loss": 0.0921,
10529
+ "step": 14970
10530
+ },
10531
+ {
10532
+ "epoch": 5.881429132312524,
10533
+ "grad_norm": 2.847191095352173,
10534
+ "learning_rate": 3.3047335126881406e-07,
10535
+ "loss": 0.0513,
10536
+ "step": 14980
10537
+ },
10538
+ {
10539
+ "epoch": 5.885355319984296,
10540
+ "grad_norm": 35.19565200805664,
10541
+ "learning_rate": 3.1956664000581693e-07,
10542
+ "loss": 0.45,
10543
+ "step": 14990
10544
+ },
10545
+ {
10546
+ "epoch": 5.889281507656066,
10547
+ "grad_norm": 53.325225830078125,
10548
+ "learning_rate": 3.0865992874281975e-07,
10549
+ "loss": 0.0693,
10550
+ "step": 15000
10551
+ },
10552
+ {
10553
+ "epoch": 5.8932076953278365,
10554
+ "grad_norm": 15.387211799621582,
10555
+ "learning_rate": 2.9775321747982256e-07,
10556
+ "loss": 0.1537,
10557
+ "step": 15010
10558
+ },
10559
+ {
10560
+ "epoch": 5.897133882999608,
10561
+ "grad_norm": 6.742716312408447,
10562
+ "learning_rate": 2.868465062168254e-07,
10563
+ "loss": 0.1971,
10564
+ "step": 15020
10565
+ },
10566
+ {
10567
+ "epoch": 5.901060070671378,
10568
+ "grad_norm": 0.0015198083128780127,
10569
+ "learning_rate": 2.759397949538283e-07,
10570
+ "loss": 0.2437,
10571
+ "step": 15030
10572
+ },
10573
+ {
10574
+ "epoch": 5.904986258343149,
10575
+ "grad_norm": 1.0140429735183716,
10576
+ "learning_rate": 2.650330836908311e-07,
10577
+ "loss": 0.0108,
10578
+ "step": 15040
10579
+ },
10580
+ {
10581
+ "epoch": 5.908912446014919,
10582
+ "grad_norm": 0.36821112036705017,
10583
+ "learning_rate": 2.5412637242783394e-07,
10584
+ "loss": 0.1091,
10585
+ "step": 15050
10586
+ },
10587
+ {
10588
+ "epoch": 5.9128386336866905,
10589
+ "grad_norm": 63.82198715209961,
10590
+ "learning_rate": 2.4321966116483676e-07,
10591
+ "loss": 0.3013,
10592
+ "step": 15060
10593
+ },
10594
+ {
10595
+ "epoch": 5.916764821358461,
10596
+ "grad_norm": 3.619304895401001,
10597
+ "learning_rate": 2.323129499018396e-07,
10598
+ "loss": 0.0495,
10599
+ "step": 15070
10600
+ },
10601
+ {
10602
+ "epoch": 5.920691009030231,
10603
+ "grad_norm": 1.9223240613937378,
10604
+ "learning_rate": 2.2140623863884245e-07,
10605
+ "loss": 0.0587,
10606
+ "step": 15080
10607
+ },
10608
+ {
10609
+ "epoch": 5.924617196702003,
10610
+ "grad_norm": 0.054315753281116486,
10611
+ "learning_rate": 2.1049952737584526e-07,
10612
+ "loss": 0.1429,
10613
+ "step": 15090
10614
+ },
10615
+ {
10616
+ "epoch": 5.928543384373773,
10617
+ "grad_norm": 0.6880110502243042,
10618
+ "learning_rate": 1.995928161128481e-07,
10619
+ "loss": 0.0702,
10620
+ "step": 15100
10621
+ },
10622
+ {
10623
+ "epoch": 5.932469572045544,
10624
+ "grad_norm": 1.5579729080200195,
10625
+ "learning_rate": 1.8868610484985093e-07,
10626
+ "loss": 0.0362,
10627
+ "step": 15110
10628
+ },
10629
+ {
10630
+ "epoch": 5.936395759717314,
10631
+ "grad_norm": 7.767218112945557,
10632
+ "learning_rate": 1.777793935868538e-07,
10633
+ "loss": 0.1695,
10634
+ "step": 15120
10635
+ },
10636
+ {
10637
+ "epoch": 5.940321947389085,
10638
+ "grad_norm": 2.507051706314087,
10639
+ "learning_rate": 1.6687268232385661e-07,
10640
+ "loss": 0.092,
10641
+ "step": 15130
10642
+ },
10643
+ {
10644
+ "epoch": 5.944248135060856,
10645
+ "grad_norm": 5.474608421325684,
10646
+ "learning_rate": 1.5596597106085946e-07,
10647
+ "loss": 0.0219,
10648
+ "step": 15140
10649
+ },
10650
+ {
10651
+ "epoch": 5.948174322732626,
10652
+ "grad_norm": 2.6450958251953125,
10653
+ "learning_rate": 1.4505925979786228e-07,
10654
+ "loss": 0.0861,
10655
+ "step": 15150
10656
+ },
10657
+ {
10658
+ "epoch": 5.952100510404398,
10659
+ "grad_norm": 0.38818231225013733,
10660
+ "learning_rate": 1.3415254853486512e-07,
10661
+ "loss": 0.023,
10662
+ "step": 15160
10663
+ },
10664
+ {
10665
+ "epoch": 5.956026698076168,
10666
+ "grad_norm": 3.4643454551696777,
10667
+ "learning_rate": 1.2324583727186794e-07,
10668
+ "loss": 0.1185,
10669
+ "step": 15170
10670
+ },
10671
+ {
10672
+ "epoch": 5.9599528857479385,
10673
+ "grad_norm": 0.08205350488424301,
10674
+ "learning_rate": 1.123391260088708e-07,
10675
+ "loss": 0.0154,
10676
+ "step": 15180
10677
+ },
10678
+ {
10679
+ "epoch": 5.96387907341971,
10680
+ "grad_norm": 19.57013702392578,
10681
+ "learning_rate": 1.0143241474587364e-07,
10682
+ "loss": 0.1135,
10683
+ "step": 15190
10684
+ },
10685
+ {
10686
+ "epoch": 5.96780526109148,
10687
+ "grad_norm": 0.3869020938873291,
10688
+ "learning_rate": 9.052570348287647e-08,
10689
+ "loss": 0.1303,
10690
+ "step": 15200
10691
+ },
10692
+ {
10693
+ "epoch": 5.971731448763251,
10694
+ "grad_norm": 0.9908976554870605,
10695
+ "learning_rate": 7.96189922198793e-08,
10696
+ "loss": 0.2838,
10697
+ "step": 15210
10698
+ },
10699
+ {
10700
+ "epoch": 5.975657636435022,
10701
+ "grad_norm": 0.01841503009200096,
10702
+ "learning_rate": 6.871228095688215e-08,
10703
+ "loss": 0.2122,
10704
+ "step": 15220
10705
+ },
10706
+ {
10707
+ "epoch": 5.9795838241067925,
10708
+ "grad_norm": 48.305274963378906,
10709
+ "learning_rate": 5.780556969388497e-08,
10710
+ "loss": 0.0936,
10711
+ "step": 15230
10712
+ },
10713
+ {
10714
+ "epoch": 5.983510011778563,
10715
+ "grad_norm": 3.678730010986328,
10716
+ "learning_rate": 4.689885843088781e-08,
10717
+ "loss": 0.0215,
10718
+ "step": 15240
10719
+ },
10720
+ {
10721
+ "epoch": 5.987436199450333,
10722
+ "grad_norm": 0.3298221826553345,
10723
+ "learning_rate": 3.599214716789064e-08,
10724
+ "loss": 0.1976,
10725
+ "step": 15250
10726
+ },
10727
+ {
10728
+ "epoch": 5.991362387122105,
10729
+ "grad_norm": 4.3517255783081055,
10730
+ "learning_rate": 2.508543590489348e-08,
10731
+ "loss": 0.0784,
10732
+ "step": 15260
10733
+ },
10734
+ {
10735
+ "epoch": 5.995288574793875,
10736
+ "grad_norm": 0.4186241924762726,
10737
+ "learning_rate": 1.4178724641896315e-08,
10738
+ "loss": 0.2141,
10739
+ "step": 15270
10740
+ },
10741
+ {
10742
+ "epoch": 5.999214762465646,
10743
+ "grad_norm": 2.5911920070648193,
10744
+ "learning_rate": 3.272013378899149e-09,
10745
+ "loss": 0.0982,
10746
+ "step": 15280
10747
+ },
10748
+ {
10749
+ "epoch": 6.0,
10750
+ "eval_loss": 0.19693690538406372,
10751
+ "eval_runtime": 11.3325,
10752
+ "eval_samples_per_second": 199.78,
10753
+ "eval_steps_per_second": 24.972,
10754
+ "step": 15282
10755
  }
10756
  ],
10757
  "logging_steps": 10,
 
10766
  "early_stopping_threshold": 0.0
10767
  },
10768
  "attributes": {
10769
+ "early_stopping_patience_counter": 0
10770
  }
10771
  },
10772
  "TrainerControl": {
 
10775
  "should_evaluate": false,
10776
  "should_log": false,
10777
  "should_save": true,
10778
+ "should_training_stop": true
10779
  },
10780
  "attributes": {}
10781
  }
10782
  },
10783
+ "total_flos": 3.4143722700698976e+16,
10784
  "train_batch_size": 8,
10785
  "trial_name": null,
10786
  "trial_params": null