n0w0f commited on
Commit
c540d1c
·
verified ·
1 Parent(s): a710b50

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 512,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 1024,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 8,
18
+ "num_hidden_layers": 4,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.57.6",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c48972ed56f41bc6850e69deb527e98193d23b540cf19aac207fb72e991efd02
3
+ size 133031496
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acf84ed9905aecbeec8c15959433d21386c70135f7b8a92f08f69d84b70a2325
3
+ size 266109515
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd2b6e658410dcd7def6a5026dcc483109a654719c408d809f1efb4f52967be0
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb99b5a4ab1433a51efe856b4ea1e15de233cd524729b809a7ab2bddd71c4e90
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[BOS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[EOS]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "30001": {
44
+ "content": "[EOS]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "30002": {
52
+ "content": "[BOS]",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ }
59
+ },
60
+ "bos_token": "[BOS]",
61
+ "clean_up_tokenization_spaces": false,
62
+ "cls_token": "[CLS]",
63
+ "eos_token": "[EOS]",
64
+ "extra_special_tokens": {},
65
+ "mask_token": "[MASK]",
66
+ "model_max_length": 1000000000000000019884624838656,
67
+ "pad_token": "[PAD]",
68
+ "sep_token": "[SEP]",
69
+ "tokenizer_class": "PreTrainedTokenizerFast",
70
+ "unk_token": "[UNK]"
71
+ }
trainer_state.json ADDED
@@ -0,0 +1,3634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 12000,
3
+ "best_metric": 0.12119368463754654,
4
+ "best_model_checkpoint": "/data/alamparan/mattext_ckpt/results/2026-02-04/03-04-44/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-12000",
5
+ "epoch": 38.70967741935484,
6
+ "eval_steps": 50,
7
+ "global_step": 12000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.16129032258064516,
14
+ "grad_norm": 1.4281898736953735,
15
+ "learning_rate": 0.00019936774193548388,
16
+ "loss": 6.0329,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.16129032258064516,
21
+ "eval_loss": 4.073571681976318,
22
+ "eval_runtime": 86.5227,
23
+ "eval_samples_per_second": 219.642,
24
+ "eval_steps_per_second": 4.577,
25
+ "step": 50
26
+ },
27
+ {
28
+ "epoch": 0.3225806451612903,
29
+ "grad_norm": 1.1555994749069214,
30
+ "learning_rate": 0.0001987225806451613,
31
+ "loss": 3.8465,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.3225806451612903,
36
+ "eval_loss": 3.6210274696350098,
37
+ "eval_runtime": 87.4418,
38
+ "eval_samples_per_second": 217.333,
39
+ "eval_steps_per_second": 4.529,
40
+ "step": 100
41
+ },
42
+ {
43
+ "epoch": 0.4838709677419355,
44
+ "grad_norm": 1.2039262056350708,
45
+ "learning_rate": 0.00019807741935483873,
46
+ "loss": 3.5783,
47
+ "step": 150
48
+ },
49
+ {
50
+ "epoch": 0.4838709677419355,
51
+ "eval_loss": 3.4356088638305664,
52
+ "eval_runtime": 86.395,
53
+ "eval_samples_per_second": 219.966,
54
+ "eval_steps_per_second": 4.584,
55
+ "step": 150
56
+ },
57
+ {
58
+ "epoch": 0.6451612903225806,
59
+ "grad_norm": 0.8887826204299927,
60
+ "learning_rate": 0.00019743225806451612,
61
+ "loss": 3.411,
62
+ "step": 200
63
+ },
64
+ {
65
+ "epoch": 0.6451612903225806,
66
+ "eval_loss": 3.294156074523926,
67
+ "eval_runtime": 89.1633,
68
+ "eval_samples_per_second": 213.137,
69
+ "eval_steps_per_second": 4.441,
70
+ "step": 200
71
+ },
72
+ {
73
+ "epoch": 0.8064516129032258,
74
+ "grad_norm": 0.9355249404907227,
75
+ "learning_rate": 0.00019678709677419356,
76
+ "loss": 3.3012,
77
+ "step": 250
78
+ },
79
+ {
80
+ "epoch": 0.8064516129032258,
81
+ "eval_loss": 3.1778066158294678,
82
+ "eval_runtime": 87.9303,
83
+ "eval_samples_per_second": 216.126,
84
+ "eval_steps_per_second": 4.504,
85
+ "step": 250
86
+ },
87
+ {
88
+ "epoch": 0.967741935483871,
89
+ "grad_norm": 0.9125154614448547,
90
+ "learning_rate": 0.000196141935483871,
91
+ "loss": 3.1839,
92
+ "step": 300
93
+ },
94
+ {
95
+ "epoch": 0.967741935483871,
96
+ "eval_loss": 3.0771267414093018,
97
+ "eval_runtime": 88.1645,
98
+ "eval_samples_per_second": 215.552,
99
+ "eval_steps_per_second": 4.492,
100
+ "step": 300
101
+ },
102
+ {
103
+ "epoch": 1.129032258064516,
104
+ "grad_norm": 1.038801670074463,
105
+ "learning_rate": 0.00019549677419354838,
106
+ "loss": 3.0949,
107
+ "step": 350
108
+ },
109
+ {
110
+ "epoch": 1.129032258064516,
111
+ "eval_loss": 2.972693920135498,
112
+ "eval_runtime": 87.3627,
113
+ "eval_samples_per_second": 217.53,
114
+ "eval_steps_per_second": 4.533,
115
+ "step": 350
116
+ },
117
+ {
118
+ "epoch": 1.2903225806451613,
119
+ "grad_norm": 0.9201057553291321,
120
+ "learning_rate": 0.00019485161290322582,
121
+ "loss": 3.0067,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 1.2903225806451613,
126
+ "eval_loss": 2.8708949089050293,
127
+ "eval_runtime": 86.2708,
128
+ "eval_samples_per_second": 220.283,
129
+ "eval_steps_per_second": 4.59,
130
+ "step": 400
131
+ },
132
+ {
133
+ "epoch": 1.4516129032258065,
134
+ "grad_norm": 1.1271328926086426,
135
+ "learning_rate": 0.00019420645161290323,
136
+ "loss": 2.8894,
137
+ "step": 450
138
+ },
139
+ {
140
+ "epoch": 1.4516129032258065,
141
+ "eval_loss": 2.741400957107544,
142
+ "eval_runtime": 88.3865,
143
+ "eval_samples_per_second": 215.01,
144
+ "eval_steps_per_second": 4.48,
145
+ "step": 450
146
+ },
147
+ {
148
+ "epoch": 1.6129032258064515,
149
+ "grad_norm": 1.6507548093795776,
150
+ "learning_rate": 0.00019356129032258065,
151
+ "loss": 2.7396,
152
+ "step": 500
153
+ },
154
+ {
155
+ "epoch": 1.6129032258064515,
156
+ "eval_loss": 2.5605974197387695,
157
+ "eval_runtime": 88.4696,
158
+ "eval_samples_per_second": 214.808,
159
+ "eval_steps_per_second": 4.476,
160
+ "step": 500
161
+ },
162
+ {
163
+ "epoch": 1.7741935483870968,
164
+ "grad_norm": 1.5166431665420532,
165
+ "learning_rate": 0.00019291612903225806,
166
+ "loss": 2.5539,
167
+ "step": 550
168
+ },
169
+ {
170
+ "epoch": 1.7741935483870968,
171
+ "eval_loss": 2.2101876735687256,
172
+ "eval_runtime": 88.7721,
173
+ "eval_samples_per_second": 214.076,
174
+ "eval_steps_per_second": 4.461,
175
+ "step": 550
176
+ },
177
+ {
178
+ "epoch": 1.935483870967742,
179
+ "grad_norm": 1.8889034986495972,
180
+ "learning_rate": 0.0001922709677419355,
181
+ "loss": 2.043,
182
+ "step": 600
183
+ },
184
+ {
185
+ "epoch": 1.935483870967742,
186
+ "eval_loss": 1.4450961351394653,
187
+ "eval_runtime": 87.3615,
188
+ "eval_samples_per_second": 217.533,
189
+ "eval_steps_per_second": 4.533,
190
+ "step": 600
191
+ },
192
+ {
193
+ "epoch": 2.096774193548387,
194
+ "grad_norm": 1.4880077838897705,
195
+ "learning_rate": 0.0001916258064516129,
196
+ "loss": 1.4473,
197
+ "step": 650
198
+ },
199
+ {
200
+ "epoch": 2.096774193548387,
201
+ "eval_loss": 1.047796607017517,
202
+ "eval_runtime": 88.4068,
203
+ "eval_samples_per_second": 214.961,
204
+ "eval_steps_per_second": 4.479,
205
+ "step": 650
206
+ },
207
+ {
208
+ "epoch": 2.258064516129032,
209
+ "grad_norm": 1.2545322179794312,
210
+ "learning_rate": 0.00019098064516129032,
211
+ "loss": 1.1162,
212
+ "step": 700
213
+ },
214
+ {
215
+ "epoch": 2.258064516129032,
216
+ "eval_loss": 0.8680551052093506,
217
+ "eval_runtime": 89.0396,
218
+ "eval_samples_per_second": 213.433,
219
+ "eval_steps_per_second": 4.447,
220
+ "step": 700
221
+ },
222
+ {
223
+ "epoch": 2.4193548387096775,
224
+ "grad_norm": 1.1532652378082275,
225
+ "learning_rate": 0.00019033548387096776,
226
+ "loss": 0.9731,
227
+ "step": 750
228
+ },
229
+ {
230
+ "epoch": 2.4193548387096775,
231
+ "eval_loss": 0.785129964351654,
232
+ "eval_runtime": 90.5213,
233
+ "eval_samples_per_second": 209.939,
234
+ "eval_steps_per_second": 4.375,
235
+ "step": 750
236
+ },
237
+ {
238
+ "epoch": 2.5806451612903225,
239
+ "grad_norm": 0.8688052892684937,
240
+ "learning_rate": 0.00018969032258064517,
241
+ "loss": 0.8907,
242
+ "step": 800
243
+ },
244
+ {
245
+ "epoch": 2.5806451612903225,
246
+ "eval_loss": 0.7259724140167236,
247
+ "eval_runtime": 88.9062,
248
+ "eval_samples_per_second": 213.753,
249
+ "eval_steps_per_second": 4.454,
250
+ "step": 800
251
+ },
252
+ {
253
+ "epoch": 2.741935483870968,
254
+ "grad_norm": 1.1574801206588745,
255
+ "learning_rate": 0.00018904516129032259,
256
+ "loss": 0.8301,
257
+ "step": 850
258
+ },
259
+ {
260
+ "epoch": 2.741935483870968,
261
+ "eval_loss": 0.6951683163642883,
262
+ "eval_runtime": 89.0056,
263
+ "eval_samples_per_second": 213.515,
264
+ "eval_steps_per_second": 4.449,
265
+ "step": 850
266
+ },
267
+ {
268
+ "epoch": 2.903225806451613,
269
+ "grad_norm": 1.0460095405578613,
270
+ "learning_rate": 0.0001884,
271
+ "loss": 0.7883,
272
+ "step": 900
273
+ },
274
+ {
275
+ "epoch": 2.903225806451613,
276
+ "eval_loss": 0.6605275869369507,
277
+ "eval_runtime": 87.0532,
278
+ "eval_samples_per_second": 218.303,
279
+ "eval_steps_per_second": 4.549,
280
+ "step": 900
281
+ },
282
+ {
283
+ "epoch": 3.064516129032258,
284
+ "grad_norm": 0.7760242223739624,
285
+ "learning_rate": 0.00018775483870967744,
286
+ "loss": 0.7428,
287
+ "step": 950
288
+ },
289
+ {
290
+ "epoch": 3.064516129032258,
291
+ "eval_loss": 0.6210550665855408,
292
+ "eval_runtime": 87.5365,
293
+ "eval_samples_per_second": 217.098,
294
+ "eval_steps_per_second": 4.524,
295
+ "step": 950
296
+ },
297
+ {
298
+ "epoch": 3.225806451612903,
299
+ "grad_norm": 0.7606909275054932,
300
+ "learning_rate": 0.00018710967741935485,
301
+ "loss": 0.7159,
302
+ "step": 1000
303
+ },
304
+ {
305
+ "epoch": 3.225806451612903,
306
+ "eval_loss": 0.60401451587677,
307
+ "eval_runtime": 86.4168,
308
+ "eval_samples_per_second": 219.911,
309
+ "eval_steps_per_second": 4.582,
310
+ "step": 1000
311
+ },
312
+ {
313
+ "epoch": 3.3870967741935485,
314
+ "grad_norm": 0.7674448490142822,
315
+ "learning_rate": 0.00018646451612903226,
316
+ "loss": 0.6965,
317
+ "step": 1050
318
+ },
319
+ {
320
+ "epoch": 3.3870967741935485,
321
+ "eval_loss": 0.583739697933197,
322
+ "eval_runtime": 87.6612,
323
+ "eval_samples_per_second": 216.789,
324
+ "eval_steps_per_second": 4.517,
325
+ "step": 1050
326
+ },
327
+ {
328
+ "epoch": 3.5483870967741935,
329
+ "grad_norm": 0.8499358296394348,
330
+ "learning_rate": 0.0001858193548387097,
331
+ "loss": 0.6706,
332
+ "step": 1100
333
+ },
334
+ {
335
+ "epoch": 3.5483870967741935,
336
+ "eval_loss": 0.567313551902771,
337
+ "eval_runtime": 84.0187,
338
+ "eval_samples_per_second": 226.188,
339
+ "eval_steps_per_second": 4.713,
340
+ "step": 1100
341
+ },
342
+ {
343
+ "epoch": 3.709677419354839,
344
+ "grad_norm": 0.8062635660171509,
345
+ "learning_rate": 0.0001851741935483871,
346
+ "loss": 0.6605,
347
+ "step": 1150
348
+ },
349
+ {
350
+ "epoch": 3.709677419354839,
351
+ "eval_loss": 0.5546169281005859,
352
+ "eval_runtime": 87.6004,
353
+ "eval_samples_per_second": 216.94,
354
+ "eval_steps_per_second": 4.521,
355
+ "step": 1150
356
+ },
357
+ {
358
+ "epoch": 3.870967741935484,
359
+ "grad_norm": 0.9007663726806641,
360
+ "learning_rate": 0.00018452903225806453,
361
+ "loss": 0.6528,
362
+ "step": 1200
363
+ },
364
+ {
365
+ "epoch": 3.870967741935484,
366
+ "eval_loss": 0.5342397093772888,
367
+ "eval_runtime": 90.2247,
368
+ "eval_samples_per_second": 210.63,
369
+ "eval_steps_per_second": 4.389,
370
+ "step": 1200
371
+ },
372
+ {
373
+ "epoch": 4.032258064516129,
374
+ "grad_norm": 0.6880891919136047,
375
+ "learning_rate": 0.00018388387096774194,
376
+ "loss": 0.6174,
377
+ "step": 1250
378
+ },
379
+ {
380
+ "epoch": 4.032258064516129,
381
+ "eval_loss": 0.5208889245986938,
382
+ "eval_runtime": 94.9972,
383
+ "eval_samples_per_second": 200.048,
384
+ "eval_steps_per_second": 4.169,
385
+ "step": 1250
386
+ },
387
+ {
388
+ "epoch": 4.193548387096774,
389
+ "grad_norm": 0.708258330821991,
390
+ "learning_rate": 0.00018323870967741935,
391
+ "loss": 0.5884,
392
+ "step": 1300
393
+ },
394
+ {
395
+ "epoch": 4.193548387096774,
396
+ "eval_loss": 0.5109750032424927,
397
+ "eval_runtime": 94.4447,
398
+ "eval_samples_per_second": 201.218,
399
+ "eval_steps_per_second": 4.193,
400
+ "step": 1300
401
+ },
402
+ {
403
+ "epoch": 4.354838709677419,
404
+ "grad_norm": 0.5771231651306152,
405
+ "learning_rate": 0.0001825935483870968,
406
+ "loss": 0.5515,
407
+ "step": 1350
408
+ },
409
+ {
410
+ "epoch": 4.354838709677419,
411
+ "eval_loss": 0.5005462169647217,
412
+ "eval_runtime": 92.4886,
413
+ "eval_samples_per_second": 205.474,
414
+ "eval_steps_per_second": 4.282,
415
+ "step": 1350
416
+ },
417
+ {
418
+ "epoch": 4.516129032258064,
419
+ "grad_norm": 0.9058783054351807,
420
+ "learning_rate": 0.0001819483870967742,
421
+ "loss": 0.5876,
422
+ "step": 1400
423
+ },
424
+ {
425
+ "epoch": 4.516129032258064,
426
+ "eval_loss": 0.4874018728733063,
427
+ "eval_runtime": 90.1408,
428
+ "eval_samples_per_second": 210.826,
429
+ "eval_steps_per_second": 4.393,
430
+ "step": 1400
431
+ },
432
+ {
433
+ "epoch": 4.67741935483871,
434
+ "grad_norm": 0.702014148235321,
435
+ "learning_rate": 0.00018130322580645162,
436
+ "loss": 0.5564,
437
+ "step": 1450
438
+ },
439
+ {
440
+ "epoch": 4.67741935483871,
441
+ "eval_loss": 0.47600802779197693,
442
+ "eval_runtime": 89.1009,
443
+ "eval_samples_per_second": 213.286,
444
+ "eval_steps_per_second": 4.444,
445
+ "step": 1450
446
+ },
447
+ {
448
+ "epoch": 4.838709677419355,
449
+ "grad_norm": 0.6843072772026062,
450
+ "learning_rate": 0.00018065806451612903,
451
+ "loss": 0.5857,
452
+ "step": 1500
453
+ },
454
+ {
455
+ "epoch": 4.838709677419355,
456
+ "eval_loss": 0.45552995800971985,
457
+ "eval_runtime": 87.6377,
458
+ "eval_samples_per_second": 216.847,
459
+ "eval_steps_per_second": 4.519,
460
+ "step": 1500
461
+ },
462
+ {
463
+ "epoch": 5.0,
464
+ "grad_norm": 0.9617322683334351,
465
+ "learning_rate": 0.00018001290322580647,
466
+ "loss": 0.5225,
467
+ "step": 1550
468
+ },
469
+ {
470
+ "epoch": 5.0,
471
+ "eval_loss": 0.448142945766449,
472
+ "eval_runtime": 88.2995,
473
+ "eval_samples_per_second": 215.222,
474
+ "eval_steps_per_second": 4.485,
475
+ "step": 1550
476
+ },
477
+ {
478
+ "epoch": 5.161290322580645,
479
+ "grad_norm": 0.5962368249893188,
480
+ "learning_rate": 0.00017936774193548388,
481
+ "loss": 0.5237,
482
+ "step": 1600
483
+ },
484
+ {
485
+ "epoch": 5.161290322580645,
486
+ "eval_loss": 0.4412171542644501,
487
+ "eval_runtime": 86.7486,
488
+ "eval_samples_per_second": 219.07,
489
+ "eval_steps_per_second": 4.565,
490
+ "step": 1600
491
+ },
492
+ {
493
+ "epoch": 5.32258064516129,
494
+ "grad_norm": 0.6392622590065002,
495
+ "learning_rate": 0.0001787225806451613,
496
+ "loss": 0.5279,
497
+ "step": 1650
498
+ },
499
+ {
500
+ "epoch": 5.32258064516129,
501
+ "eval_loss": 0.43270382285118103,
502
+ "eval_runtime": 86.3243,
503
+ "eval_samples_per_second": 220.147,
504
+ "eval_steps_per_second": 4.587,
505
+ "step": 1650
506
+ },
507
+ {
508
+ "epoch": 5.483870967741936,
509
+ "grad_norm": 0.5868324637413025,
510
+ "learning_rate": 0.00017807741935483873,
511
+ "loss": 0.4941,
512
+ "step": 1700
513
+ },
514
+ {
515
+ "epoch": 5.483870967741936,
516
+ "eval_loss": 0.42134207487106323,
517
+ "eval_runtime": 87.3073,
518
+ "eval_samples_per_second": 217.668,
519
+ "eval_steps_per_second": 4.536,
520
+ "step": 1700
521
+ },
522
+ {
523
+ "epoch": 5.645161290322581,
524
+ "grad_norm": 0.6335242986679077,
525
+ "learning_rate": 0.00017743225806451615,
526
+ "loss": 0.4771,
527
+ "step": 1750
528
+ },
529
+ {
530
+ "epoch": 5.645161290322581,
531
+ "eval_loss": 0.4060479998588562,
532
+ "eval_runtime": 86.2245,
533
+ "eval_samples_per_second": 220.402,
534
+ "eval_steps_per_second": 4.593,
535
+ "step": 1750
536
+ },
537
+ {
538
+ "epoch": 5.806451612903226,
539
+ "grad_norm": 0.6034550666809082,
540
+ "learning_rate": 0.00017678709677419356,
541
+ "loss": 0.4411,
542
+ "step": 1800
543
+ },
544
+ {
545
+ "epoch": 5.806451612903226,
546
+ "eval_loss": 0.39572522044181824,
547
+ "eval_runtime": 86.9597,
548
+ "eval_samples_per_second": 218.538,
549
+ "eval_steps_per_second": 4.554,
550
+ "step": 1800
551
+ },
552
+ {
553
+ "epoch": 5.967741935483871,
554
+ "grad_norm": 0.6557337045669556,
555
+ "learning_rate": 0.00017614193548387097,
556
+ "loss": 0.4511,
557
+ "step": 1850
558
+ },
559
+ {
560
+ "epoch": 5.967741935483871,
561
+ "eval_loss": 0.3814023435115814,
562
+ "eval_runtime": 88.9636,
563
+ "eval_samples_per_second": 213.615,
564
+ "eval_steps_per_second": 4.451,
565
+ "step": 1850
566
+ },
567
+ {
568
+ "epoch": 6.129032258064516,
569
+ "grad_norm": 0.510995090007782,
570
+ "learning_rate": 0.0001754967741935484,
571
+ "loss": 0.4288,
572
+ "step": 1900
573
+ },
574
+ {
575
+ "epoch": 6.129032258064516,
576
+ "eval_loss": 0.3621025085449219,
577
+ "eval_runtime": 92.0174,
578
+ "eval_samples_per_second": 206.526,
579
+ "eval_steps_per_second": 4.304,
580
+ "step": 1900
581
+ },
582
+ {
583
+ "epoch": 6.290322580645161,
584
+ "grad_norm": 0.644413948059082,
585
+ "learning_rate": 0.00017485161290322582,
586
+ "loss": 0.4234,
587
+ "step": 1950
588
+ },
589
+ {
590
+ "epoch": 6.290322580645161,
591
+ "eval_loss": 0.35021910071372986,
592
+ "eval_runtime": 95.7807,
593
+ "eval_samples_per_second": 198.411,
594
+ "eval_steps_per_second": 4.134,
595
+ "step": 1950
596
+ },
597
+ {
598
+ "epoch": 6.451612903225806,
599
+ "grad_norm": 0.5785158276557922,
600
+ "learning_rate": 0.00017420645161290323,
601
+ "loss": 0.4009,
602
+ "step": 2000
603
+ },
604
+ {
605
+ "epoch": 6.451612903225806,
606
+ "eval_loss": 0.33460476994514465,
607
+ "eval_runtime": 146.86,
608
+ "eval_samples_per_second": 129.402,
609
+ "eval_steps_per_second": 2.696,
610
+ "step": 2000
611
+ },
612
+ {
613
+ "epoch": 6.612903225806452,
614
+ "grad_norm": 0.7037348747253418,
615
+ "learning_rate": 0.00017356129032258067,
616
+ "loss": 0.3922,
617
+ "step": 2050
618
+ },
619
+ {
620
+ "epoch": 6.612903225806452,
621
+ "eval_loss": 0.3163394331932068,
622
+ "eval_runtime": 179.0958,
623
+ "eval_samples_per_second": 106.111,
624
+ "eval_steps_per_second": 2.211,
625
+ "step": 2050
626
+ },
627
+ {
628
+ "epoch": 6.774193548387097,
629
+ "grad_norm": 0.6514049768447876,
630
+ "learning_rate": 0.00017291612903225806,
631
+ "loss": 0.3524,
632
+ "step": 2100
633
+ },
634
+ {
635
+ "epoch": 6.774193548387097,
636
+ "eval_loss": 0.30841001868247986,
637
+ "eval_runtime": 179.6968,
638
+ "eval_samples_per_second": 105.756,
639
+ "eval_steps_per_second": 2.204,
640
+ "step": 2100
641
+ },
642
+ {
643
+ "epoch": 6.935483870967742,
644
+ "grad_norm": 0.6443042755126953,
645
+ "learning_rate": 0.0001722709677419355,
646
+ "loss": 0.3346,
647
+ "step": 2150
648
+ },
649
+ {
650
+ "epoch": 6.935483870967742,
651
+ "eval_loss": 0.2951297163963318,
652
+ "eval_runtime": 180.7736,
653
+ "eval_samples_per_second": 105.126,
654
+ "eval_steps_per_second": 2.191,
655
+ "step": 2150
656
+ },
657
+ {
658
+ "epoch": 7.096774193548387,
659
+ "grad_norm": 0.6390765309333801,
660
+ "learning_rate": 0.0001716258064516129,
661
+ "loss": 0.3255,
662
+ "step": 2200
663
+ },
664
+ {
665
+ "epoch": 7.096774193548387,
666
+ "eval_loss": 0.2864570915699005,
667
+ "eval_runtime": 177.5655,
668
+ "eval_samples_per_second": 107.025,
669
+ "eval_steps_per_second": 2.23,
670
+ "step": 2200
671
+ },
672
+ {
673
+ "epoch": 7.258064516129032,
674
+ "grad_norm": 0.6152281165122986,
675
+ "learning_rate": 0.00017098064516129032,
676
+ "loss": 0.3258,
677
+ "step": 2250
678
+ },
679
+ {
680
+ "epoch": 7.258064516129032,
681
+ "eval_loss": 0.26722875237464905,
682
+ "eval_runtime": 162.1355,
683
+ "eval_samples_per_second": 117.211,
684
+ "eval_steps_per_second": 2.442,
685
+ "step": 2250
686
+ },
687
+ {
688
+ "epoch": 7.419354838709677,
689
+ "grad_norm": 0.5493067502975464,
690
+ "learning_rate": 0.00017033548387096776,
691
+ "loss": 0.3071,
692
+ "step": 2300
693
+ },
694
+ {
695
+ "epoch": 7.419354838709677,
696
+ "eval_loss": 0.2526390552520752,
697
+ "eval_runtime": 200.1286,
698
+ "eval_samples_per_second": 94.959,
699
+ "eval_steps_per_second": 1.979,
700
+ "step": 2300
701
+ },
702
+ {
703
+ "epoch": 7.580645161290323,
704
+ "grad_norm": 0.5964747667312622,
705
+ "learning_rate": 0.00016969032258064518,
706
+ "loss": 0.288,
707
+ "step": 2350
708
+ },
709
+ {
710
+ "epoch": 7.580645161290323,
711
+ "eval_loss": 0.24554206430912018,
712
+ "eval_runtime": 275.0797,
713
+ "eval_samples_per_second": 69.085,
714
+ "eval_steps_per_second": 1.44,
715
+ "step": 2350
716
+ },
717
+ {
718
+ "epoch": 7.741935483870968,
719
+ "grad_norm": 0.5316987037658691,
720
+ "learning_rate": 0.0001690451612903226,
721
+ "loss": 0.2739,
722
+ "step": 2400
723
+ },
724
+ {
725
+ "epoch": 7.741935483870968,
726
+ "eval_loss": 0.23691914975643158,
727
+ "eval_runtime": 175.1655,
728
+ "eval_samples_per_second": 108.492,
729
+ "eval_steps_per_second": 2.261,
730
+ "step": 2400
731
+ },
732
+ {
733
+ "epoch": 7.903225806451613,
734
+ "grad_norm": 0.47612282633781433,
735
+ "learning_rate": 0.0001684,
736
+ "loss": 0.2706,
737
+ "step": 2450
738
+ },
739
+ {
740
+ "epoch": 7.903225806451613,
741
+ "eval_loss": 0.23088718950748444,
742
+ "eval_runtime": 174.1382,
743
+ "eval_samples_per_second": 109.132,
744
+ "eval_steps_per_second": 2.274,
745
+ "step": 2450
746
+ },
747
+ {
748
+ "epoch": 8.064516129032258,
749
+ "grad_norm": 0.5631929039955139,
750
+ "learning_rate": 0.00016775483870967744,
751
+ "loss": 0.2645,
752
+ "step": 2500
753
+ },
754
+ {
755
+ "epoch": 8.064516129032258,
756
+ "eval_loss": 0.229031041264534,
757
+ "eval_runtime": 321.9761,
758
+ "eval_samples_per_second": 59.023,
759
+ "eval_steps_per_second": 1.23,
760
+ "step": 2500
761
+ },
762
+ {
763
+ "epoch": 8.225806451612904,
764
+ "grad_norm": 0.5553293228149414,
765
+ "learning_rate": 0.00016710967741935483,
766
+ "loss": 0.2604,
767
+ "step": 2550
768
+ },
769
+ {
770
+ "epoch": 8.225806451612904,
771
+ "eval_loss": 0.22551080584526062,
772
+ "eval_runtime": 388.7037,
773
+ "eval_samples_per_second": 48.891,
774
+ "eval_steps_per_second": 1.019,
775
+ "step": 2550
776
+ },
777
+ {
778
+ "epoch": 8.387096774193548,
779
+ "grad_norm": 0.5779751539230347,
780
+ "learning_rate": 0.00016646451612903226,
781
+ "loss": 0.2537,
782
+ "step": 2600
783
+ },
784
+ {
785
+ "epoch": 8.387096774193548,
786
+ "eval_loss": 0.22034035623073578,
787
+ "eval_runtime": 296.8732,
788
+ "eval_samples_per_second": 64.014,
789
+ "eval_steps_per_second": 1.334,
790
+ "step": 2600
791
+ },
792
+ {
793
+ "epoch": 8.548387096774194,
794
+ "grad_norm": 0.4632438123226166,
795
+ "learning_rate": 0.0001658193548387097,
796
+ "loss": 0.2505,
797
+ "step": 2650
798
+ },
799
+ {
800
+ "epoch": 8.548387096774194,
801
+ "eval_loss": 0.21870557963848114,
802
+ "eval_runtime": 195.9343,
803
+ "eval_samples_per_second": 96.992,
804
+ "eval_steps_per_second": 2.021,
805
+ "step": 2650
806
+ },
807
+ {
808
+ "epoch": 8.709677419354838,
809
+ "grad_norm": 0.5242488980293274,
810
+ "learning_rate": 0.0001651741935483871,
811
+ "loss": 0.2465,
812
+ "step": 2700
813
+ },
814
+ {
815
+ "epoch": 8.709677419354838,
816
+ "eval_loss": 0.21318656206130981,
817
+ "eval_runtime": 199.2062,
818
+ "eval_samples_per_second": 95.399,
819
+ "eval_steps_per_second": 1.988,
820
+ "step": 2700
821
+ },
822
+ {
823
+ "epoch": 8.870967741935484,
824
+ "grad_norm": 0.5110979676246643,
825
+ "learning_rate": 0.00016452903225806453,
826
+ "loss": 0.2411,
827
+ "step": 2750
828
+ },
829
+ {
830
+ "epoch": 8.870967741935484,
831
+ "eval_loss": 0.21142736077308655,
832
+ "eval_runtime": 196.2975,
833
+ "eval_samples_per_second": 96.812,
834
+ "eval_steps_per_second": 2.017,
835
+ "step": 2750
836
+ },
837
+ {
838
+ "epoch": 9.03225806451613,
839
+ "grad_norm": 0.4415765702724457,
840
+ "learning_rate": 0.00016388387096774194,
841
+ "loss": 0.2431,
842
+ "step": 2800
843
+ },
844
+ {
845
+ "epoch": 9.03225806451613,
846
+ "eval_loss": 0.20968259871006012,
847
+ "eval_runtime": 190.8363,
848
+ "eval_samples_per_second": 99.583,
849
+ "eval_steps_per_second": 2.075,
850
+ "step": 2800
851
+ },
852
+ {
853
+ "epoch": 9.193548387096774,
854
+ "grad_norm": 0.4640558362007141,
855
+ "learning_rate": 0.00016323870967741935,
856
+ "loss": 0.2416,
857
+ "step": 2850
858
+ },
859
+ {
860
+ "epoch": 9.193548387096774,
861
+ "eval_loss": 0.206209197640419,
862
+ "eval_runtime": 219.8043,
863
+ "eval_samples_per_second": 86.459,
864
+ "eval_steps_per_second": 1.802,
865
+ "step": 2850
866
+ },
867
+ {
868
+ "epoch": 9.35483870967742,
869
+ "grad_norm": 0.5190003514289856,
870
+ "learning_rate": 0.00016259354838709677,
871
+ "loss": 0.2321,
872
+ "step": 2900
873
+ },
874
+ {
875
+ "epoch": 9.35483870967742,
876
+ "eval_loss": 0.20502887666225433,
877
+ "eval_runtime": 232.185,
878
+ "eval_samples_per_second": 81.849,
879
+ "eval_steps_per_second": 1.706,
880
+ "step": 2900
881
+ },
882
+ {
883
+ "epoch": 9.516129032258064,
884
+ "grad_norm": 0.5595571398735046,
885
+ "learning_rate": 0.0001619483870967742,
886
+ "loss": 0.2317,
887
+ "step": 2950
888
+ },
889
+ {
890
+ "epoch": 9.516129032258064,
891
+ "eval_loss": 0.20400060713291168,
892
+ "eval_runtime": 295.1096,
893
+ "eval_samples_per_second": 64.396,
894
+ "eval_steps_per_second": 1.342,
895
+ "step": 2950
896
+ },
897
+ {
898
+ "epoch": 9.67741935483871,
899
+ "grad_norm": 0.5060888528823853,
900
+ "learning_rate": 0.00016130322580645162,
901
+ "loss": 0.2263,
902
+ "step": 3000
903
+ },
904
+ {
905
+ "epoch": 9.67741935483871,
906
+ "eval_loss": 0.2005593329668045,
907
+ "eval_runtime": 201.7676,
908
+ "eval_samples_per_second": 94.188,
909
+ "eval_steps_per_second": 1.963,
910
+ "step": 3000
911
+ },
912
+ {
913
+ "epoch": 9.838709677419354,
914
+ "grad_norm": 0.5235562920570374,
915
+ "learning_rate": 0.00016065806451612903,
916
+ "loss": 0.2209,
917
+ "step": 3050
918
+ },
919
+ {
920
+ "epoch": 9.838709677419354,
921
+ "eval_loss": 0.2015007883310318,
922
+ "eval_runtime": 200.5101,
923
+ "eval_samples_per_second": 94.778,
924
+ "eval_steps_per_second": 1.975,
925
+ "step": 3050
926
+ },
927
+ {
928
+ "epoch": 10.0,
929
+ "grad_norm": 0.7316103577613831,
930
+ "learning_rate": 0.00016001290322580647,
931
+ "loss": 0.2184,
932
+ "step": 3100
933
+ },
934
+ {
935
+ "epoch": 10.0,
936
+ "eval_loss": 0.1977168172597885,
937
+ "eval_runtime": 185.6624,
938
+ "eval_samples_per_second": 102.358,
939
+ "eval_steps_per_second": 2.133,
940
+ "step": 3100
941
+ },
942
+ {
943
+ "epoch": 10.161290322580646,
944
+ "grad_norm": 0.4775584638118744,
945
+ "learning_rate": 0.00015936774193548388,
946
+ "loss": 0.2204,
947
+ "step": 3150
948
+ },
949
+ {
950
+ "epoch": 10.161290322580646,
951
+ "eval_loss": 0.19870446622371674,
952
+ "eval_runtime": 192.3529,
953
+ "eval_samples_per_second": 98.798,
954
+ "eval_steps_per_second": 2.059,
955
+ "step": 3150
956
+ },
957
+ {
958
+ "epoch": 10.32258064516129,
959
+ "grad_norm": 0.4865228533744812,
960
+ "learning_rate": 0.0001587225806451613,
961
+ "loss": 0.2152,
962
+ "step": 3200
963
+ },
964
+ {
965
+ "epoch": 10.32258064516129,
966
+ "eval_loss": 0.19557693600654602,
967
+ "eval_runtime": 195.6795,
968
+ "eval_samples_per_second": 97.118,
969
+ "eval_steps_per_second": 2.024,
970
+ "step": 3200
971
+ },
972
+ {
973
+ "epoch": 10.483870967741936,
974
+ "grad_norm": 0.5472003221511841,
975
+ "learning_rate": 0.0001580774193548387,
976
+ "loss": 0.2188,
977
+ "step": 3250
978
+ },
979
+ {
980
+ "epoch": 10.483870967741936,
981
+ "eval_loss": 0.19073671102523804,
982
+ "eval_runtime": 188.5485,
983
+ "eval_samples_per_second": 100.791,
984
+ "eval_steps_per_second": 2.1,
985
+ "step": 3250
986
+ },
987
+ {
988
+ "epoch": 10.64516129032258,
989
+ "grad_norm": 0.46814078092575073,
990
+ "learning_rate": 0.00015743225806451615,
991
+ "loss": 0.2122,
992
+ "step": 3300
993
+ },
994
+ {
995
+ "epoch": 10.64516129032258,
996
+ "eval_loss": 0.1892285943031311,
997
+ "eval_runtime": 190.3556,
998
+ "eval_samples_per_second": 99.834,
999
+ "eval_steps_per_second": 2.08,
1000
+ "step": 3300
1001
+ },
1002
+ {
1003
+ "epoch": 10.806451612903226,
1004
+ "grad_norm": 0.39085471630096436,
1005
+ "learning_rate": 0.00015678709677419356,
1006
+ "loss": 0.2157,
1007
+ "step": 3350
1008
+ },
1009
+ {
1010
+ "epoch": 10.806451612903226,
1011
+ "eval_loss": 0.19001252949237823,
1012
+ "eval_runtime": 189.4897,
1013
+ "eval_samples_per_second": 100.29,
1014
+ "eval_steps_per_second": 2.09,
1015
+ "step": 3350
1016
+ },
1017
+ {
1018
+ "epoch": 10.967741935483872,
1019
+ "grad_norm": 0.4231501519680023,
1020
+ "learning_rate": 0.00015614193548387097,
1021
+ "loss": 0.2162,
1022
+ "step": 3400
1023
+ },
1024
+ {
1025
+ "epoch": 10.967741935483872,
1026
+ "eval_loss": 0.18620692193508148,
1027
+ "eval_runtime": 190.5331,
1028
+ "eval_samples_per_second": 99.741,
1029
+ "eval_steps_per_second": 2.078,
1030
+ "step": 3400
1031
+ },
1032
+ {
1033
+ "epoch": 11.129032258064516,
1034
+ "grad_norm": 0.5186159610748291,
1035
+ "learning_rate": 0.0001554967741935484,
1036
+ "loss": 0.2101,
1037
+ "step": 3450
1038
+ },
1039
+ {
1040
+ "epoch": 11.129032258064516,
1041
+ "eval_loss": 0.18647471070289612,
1042
+ "eval_runtime": 188.7317,
1043
+ "eval_samples_per_second": 100.693,
1044
+ "eval_steps_per_second": 2.098,
1045
+ "step": 3450
1046
+ },
1047
+ {
1048
+ "epoch": 11.290322580645162,
1049
+ "grad_norm": 0.4359528124332428,
1050
+ "learning_rate": 0.0001548516129032258,
1051
+ "loss": 0.2051,
1052
+ "step": 3500
1053
+ },
1054
+ {
1055
+ "epoch": 11.290322580645162,
1056
+ "eval_loss": 0.18508924543857574,
1057
+ "eval_runtime": 187.5602,
1058
+ "eval_samples_per_second": 101.322,
1059
+ "eval_steps_per_second": 2.111,
1060
+ "step": 3500
1061
+ },
1062
+ {
1063
+ "epoch": 11.451612903225806,
1064
+ "grad_norm": 0.35816341638565063,
1065
+ "learning_rate": 0.00015420645161290324,
1066
+ "loss": 0.2027,
1067
+ "step": 3550
1068
+ },
1069
+ {
1070
+ "epoch": 11.451612903225806,
1071
+ "eval_loss": 0.18324784934520721,
1072
+ "eval_runtime": 188.342,
1073
+ "eval_samples_per_second": 100.902,
1074
+ "eval_steps_per_second": 2.103,
1075
+ "step": 3550
1076
+ },
1077
+ {
1078
+ "epoch": 11.612903225806452,
1079
+ "grad_norm": 0.4818542003631592,
1080
+ "learning_rate": 0.00015356129032258065,
1081
+ "loss": 0.2059,
1082
+ "step": 3600
1083
+ },
1084
+ {
1085
+ "epoch": 11.612903225806452,
1086
+ "eval_loss": 0.18456551432609558,
1087
+ "eval_runtime": 190.2916,
1088
+ "eval_samples_per_second": 99.868,
1089
+ "eval_steps_per_second": 2.081,
1090
+ "step": 3600
1091
+ },
1092
+ {
1093
+ "epoch": 11.774193548387096,
1094
+ "grad_norm": 0.560644805431366,
1095
+ "learning_rate": 0.00015291612903225806,
1096
+ "loss": 0.1994,
1097
+ "step": 3650
1098
+ },
1099
+ {
1100
+ "epoch": 11.774193548387096,
1101
+ "eval_loss": 0.1837019920349121,
1102
+ "eval_runtime": 193.7316,
1103
+ "eval_samples_per_second": 98.094,
1104
+ "eval_steps_per_second": 2.044,
1105
+ "step": 3650
1106
+ },
1107
+ {
1108
+ "epoch": 11.935483870967742,
1109
+ "grad_norm": 0.4187397360801697,
1110
+ "learning_rate": 0.0001522709677419355,
1111
+ "loss": 0.208,
1112
+ "step": 3700
1113
+ },
1114
+ {
1115
+ "epoch": 11.935483870967742,
1116
+ "eval_loss": 0.18199948966503143,
1117
+ "eval_runtime": 190.9935,
1118
+ "eval_samples_per_second": 99.501,
1119
+ "eval_steps_per_second": 2.073,
1120
+ "step": 3700
1121
+ },
1122
+ {
1123
+ "epoch": 12.096774193548388,
1124
+ "grad_norm": 0.47370871901512146,
1125
+ "learning_rate": 0.0001516258064516129,
1126
+ "loss": 0.197,
1127
+ "step": 3750
1128
+ },
1129
+ {
1130
+ "epoch": 12.096774193548388,
1131
+ "eval_loss": 0.18183062970638275,
1132
+ "eval_runtime": 189.271,
1133
+ "eval_samples_per_second": 100.406,
1134
+ "eval_steps_per_second": 2.092,
1135
+ "step": 3750
1136
+ },
1137
+ {
1138
+ "epoch": 12.258064516129032,
1139
+ "grad_norm": 0.5040688514709473,
1140
+ "learning_rate": 0.00015098064516129033,
1141
+ "loss": 0.1984,
1142
+ "step": 3800
1143
+ },
1144
+ {
1145
+ "epoch": 12.258064516129032,
1146
+ "eval_loss": 0.17957282066345215,
1147
+ "eval_runtime": 187.7702,
1148
+ "eval_samples_per_second": 101.209,
1149
+ "eval_steps_per_second": 2.109,
1150
+ "step": 3800
1151
+ },
1152
+ {
1153
+ "epoch": 12.419354838709678,
1154
+ "grad_norm": 0.44129958748817444,
1155
+ "learning_rate": 0.00015033548387096774,
1156
+ "loss": 0.1905,
1157
+ "step": 3850
1158
+ },
1159
+ {
1160
+ "epoch": 12.419354838709678,
1161
+ "eval_loss": 0.17632746696472168,
1162
+ "eval_runtime": 189.2669,
1163
+ "eval_samples_per_second": 100.408,
1164
+ "eval_steps_per_second": 2.092,
1165
+ "step": 3850
1166
+ },
1167
+ {
1168
+ "epoch": 12.580645161290322,
1169
+ "grad_norm": 0.38648995757102966,
1170
+ "learning_rate": 0.00014969032258064518,
1171
+ "loss": 0.1915,
1172
+ "step": 3900
1173
+ },
1174
+ {
1175
+ "epoch": 12.580645161290322,
1176
+ "eval_loss": 0.1763821244239807,
1177
+ "eval_runtime": 187.8282,
1178
+ "eval_samples_per_second": 101.178,
1179
+ "eval_steps_per_second": 2.108,
1180
+ "step": 3900
1181
+ },
1182
+ {
1183
+ "epoch": 12.741935483870968,
1184
+ "grad_norm": 0.46786022186279297,
1185
+ "learning_rate": 0.0001490451612903226,
1186
+ "loss": 0.1964,
1187
+ "step": 3950
1188
+ },
1189
+ {
1190
+ "epoch": 12.741935483870968,
1191
+ "eval_loss": 0.1770430952310562,
1192
+ "eval_runtime": 192.9563,
1193
+ "eval_samples_per_second": 98.489,
1194
+ "eval_steps_per_second": 2.052,
1195
+ "step": 3950
1196
+ },
1197
+ {
1198
+ "epoch": 12.903225806451612,
1199
+ "grad_norm": 0.45657825469970703,
1200
+ "learning_rate": 0.0001484,
1201
+ "loss": 0.194,
1202
+ "step": 4000
1203
+ },
1204
+ {
1205
+ "epoch": 12.903225806451612,
1206
+ "eval_loss": 0.1744171380996704,
1207
+ "eval_runtime": 191.9963,
1208
+ "eval_samples_per_second": 98.981,
1209
+ "eval_steps_per_second": 2.063,
1210
+ "step": 4000
1211
+ },
1212
+ {
1213
+ "epoch": 13.064516129032258,
1214
+ "grad_norm": 0.4950830936431885,
1215
+ "learning_rate": 0.00014775483870967744,
1216
+ "loss": 0.1897,
1217
+ "step": 4050
1218
+ },
1219
+ {
1220
+ "epoch": 13.064516129032258,
1221
+ "eval_loss": 0.1715860813856125,
1222
+ "eval_runtime": 190.2705,
1223
+ "eval_samples_per_second": 99.879,
1224
+ "eval_steps_per_second": 2.081,
1225
+ "step": 4050
1226
+ },
1227
+ {
1228
+ "epoch": 13.225806451612904,
1229
+ "grad_norm": 0.4676801562309265,
1230
+ "learning_rate": 0.00014710967741935485,
1231
+ "loss": 0.1837,
1232
+ "step": 4100
1233
+ },
1234
+ {
1235
+ "epoch": 13.225806451612904,
1236
+ "eval_loss": 0.16961637139320374,
1237
+ "eval_runtime": 177.9195,
1238
+ "eval_samples_per_second": 106.812,
1239
+ "eval_steps_per_second": 2.226,
1240
+ "step": 4100
1241
+ },
1242
+ {
1243
+ "epoch": 13.387096774193548,
1244
+ "grad_norm": 0.3973780870437622,
1245
+ "learning_rate": 0.00014646451612903227,
1246
+ "loss": 0.1856,
1247
+ "step": 4150
1248
+ },
1249
+ {
1250
+ "epoch": 13.387096774193548,
1251
+ "eval_loss": 0.17067177593708038,
1252
+ "eval_runtime": 197.9884,
1253
+ "eval_samples_per_second": 95.985,
1254
+ "eval_steps_per_second": 2.0,
1255
+ "step": 4150
1256
+ },
1257
+ {
1258
+ "epoch": 13.548387096774194,
1259
+ "grad_norm": 0.46780818700790405,
1260
+ "learning_rate": 0.00014581935483870968,
1261
+ "loss": 0.186,
1262
+ "step": 4200
1263
+ },
1264
+ {
1265
+ "epoch": 13.548387096774194,
1266
+ "eval_loss": 0.17197734117507935,
1267
+ "eval_runtime": 196.3462,
1268
+ "eval_samples_per_second": 96.788,
1269
+ "eval_steps_per_second": 2.017,
1270
+ "step": 4200
1271
+ },
1272
+ {
1273
+ "epoch": 13.709677419354838,
1274
+ "grad_norm": 0.5169796347618103,
1275
+ "learning_rate": 0.00014517419354838712,
1276
+ "loss": 0.1886,
1277
+ "step": 4250
1278
+ },
1279
+ {
1280
+ "epoch": 13.709677419354838,
1281
+ "eval_loss": 0.1707415133714676,
1282
+ "eval_runtime": 194.2514,
1283
+ "eval_samples_per_second": 97.832,
1284
+ "eval_steps_per_second": 2.039,
1285
+ "step": 4250
1286
+ },
1287
+ {
1288
+ "epoch": 13.870967741935484,
1289
+ "grad_norm": 0.36574116349220276,
1290
+ "learning_rate": 0.00014452903225806453,
1291
+ "loss": 0.1857,
1292
+ "step": 4300
1293
+ },
1294
+ {
1295
+ "epoch": 13.870967741935484,
1296
+ "eval_loss": 0.16808755695819855,
1297
+ "eval_runtime": 192.206,
1298
+ "eval_samples_per_second": 98.873,
1299
+ "eval_steps_per_second": 2.06,
1300
+ "step": 4300
1301
+ },
1302
+ {
1303
+ "epoch": 14.03225806451613,
1304
+ "grad_norm": 0.4412609338760376,
1305
+ "learning_rate": 0.00014388387096774194,
1306
+ "loss": 0.1819,
1307
+ "step": 4350
1308
+ },
1309
+ {
1310
+ "epoch": 14.03225806451613,
1311
+ "eval_loss": 0.17015832662582397,
1312
+ "eval_runtime": 194.1959,
1313
+ "eval_samples_per_second": 97.86,
1314
+ "eval_steps_per_second": 2.039,
1315
+ "step": 4350
1316
+ },
1317
+ {
1318
+ "epoch": 14.193548387096774,
1319
+ "grad_norm": 0.35714399814605713,
1320
+ "learning_rate": 0.00014323870967741938,
1321
+ "loss": 0.179,
1322
+ "step": 4400
1323
+ },
1324
+ {
1325
+ "epoch": 14.193548387096774,
1326
+ "eval_loss": 0.16659317910671234,
1327
+ "eval_runtime": 190.7696,
1328
+ "eval_samples_per_second": 99.618,
1329
+ "eval_steps_per_second": 2.076,
1330
+ "step": 4400
1331
+ },
1332
+ {
1333
+ "epoch": 14.35483870967742,
1334
+ "grad_norm": 0.5498349070549011,
1335
+ "learning_rate": 0.00014259354838709677,
1336
+ "loss": 0.174,
1337
+ "step": 4450
1338
+ },
1339
+ {
1340
+ "epoch": 14.35483870967742,
1341
+ "eval_loss": 0.166724294424057,
1342
+ "eval_runtime": 194.2466,
1343
+ "eval_samples_per_second": 97.834,
1344
+ "eval_steps_per_second": 2.039,
1345
+ "step": 4450
1346
+ },
1347
+ {
1348
+ "epoch": 14.516129032258064,
1349
+ "grad_norm": 0.3779986798763275,
1350
+ "learning_rate": 0.0001419483870967742,
1351
+ "loss": 0.18,
1352
+ "step": 4500
1353
+ },
1354
+ {
1355
+ "epoch": 14.516129032258064,
1356
+ "eval_loss": 0.16606487333774567,
1357
+ "eval_runtime": 193.3423,
1358
+ "eval_samples_per_second": 98.292,
1359
+ "eval_steps_per_second": 2.048,
1360
+ "step": 4500
1361
+ },
1362
+ {
1363
+ "epoch": 14.67741935483871,
1364
+ "grad_norm": 0.5511322021484375,
1365
+ "learning_rate": 0.00014130322580645162,
1366
+ "loss": 0.1789,
1367
+ "step": 4550
1368
+ },
1369
+ {
1370
+ "epoch": 14.67741935483871,
1371
+ "eval_loss": 0.16656480729579926,
1372
+ "eval_runtime": 270.6846,
1373
+ "eval_samples_per_second": 70.207,
1374
+ "eval_steps_per_second": 1.463,
1375
+ "step": 4550
1376
+ },
1377
+ {
1378
+ "epoch": 14.838709677419354,
1379
+ "grad_norm": 0.3705432415008545,
1380
+ "learning_rate": 0.00014065806451612903,
1381
+ "loss": 0.1793,
1382
+ "step": 4600
1383
+ },
1384
+ {
1385
+ "epoch": 14.838709677419354,
1386
+ "eval_loss": 0.1630878746509552,
1387
+ "eval_runtime": 203.349,
1388
+ "eval_samples_per_second": 93.455,
1389
+ "eval_steps_per_second": 1.947,
1390
+ "step": 4600
1391
+ },
1392
+ {
1393
+ "epoch": 15.0,
1394
+ "grad_norm": 0.6959958076477051,
1395
+ "learning_rate": 0.00014001290322580647,
1396
+ "loss": 0.1849,
1397
+ "step": 4650
1398
+ },
1399
+ {
1400
+ "epoch": 15.0,
1401
+ "eval_loss": 0.16222645342350006,
1402
+ "eval_runtime": 196.7729,
1403
+ "eval_samples_per_second": 96.578,
1404
+ "eval_steps_per_second": 2.012,
1405
+ "step": 4650
1406
+ },
1407
+ {
1408
+ "epoch": 15.161290322580646,
1409
+ "grad_norm": 0.38614702224731445,
1410
+ "learning_rate": 0.00013936774193548388,
1411
+ "loss": 0.1729,
1412
+ "step": 4700
1413
+ },
1414
+ {
1415
+ "epoch": 15.161290322580646,
1416
+ "eval_loss": 0.16200029850006104,
1417
+ "eval_runtime": 191.2671,
1418
+ "eval_samples_per_second": 99.358,
1419
+ "eval_steps_per_second": 2.07,
1420
+ "step": 4700
1421
+ },
1422
+ {
1423
+ "epoch": 15.32258064516129,
1424
+ "grad_norm": 0.4022436738014221,
1425
+ "learning_rate": 0.0001387225806451613,
1426
+ "loss": 0.1766,
1427
+ "step": 4750
1428
+ },
1429
+ {
1430
+ "epoch": 15.32258064516129,
1431
+ "eval_loss": 0.1640605926513672,
1432
+ "eval_runtime": 198.7156,
1433
+ "eval_samples_per_second": 95.634,
1434
+ "eval_steps_per_second": 1.993,
1435
+ "step": 4750
1436
+ },
1437
+ {
1438
+ "epoch": 15.483870967741936,
1439
+ "grad_norm": 0.45434874296188354,
1440
+ "learning_rate": 0.0001380774193548387,
1441
+ "loss": 0.1698,
1442
+ "step": 4800
1443
+ },
1444
+ {
1445
+ "epoch": 15.483870967741936,
1446
+ "eval_loss": 0.16334731876850128,
1447
+ "eval_runtime": 188.1897,
1448
+ "eval_samples_per_second": 100.983,
1449
+ "eval_steps_per_second": 2.104,
1450
+ "step": 4800
1451
+ },
1452
+ {
1453
+ "epoch": 15.64516129032258,
1454
+ "grad_norm": 0.42634057998657227,
1455
+ "learning_rate": 0.00013743225806451615,
1456
+ "loss": 0.1766,
1457
+ "step": 4850
1458
+ },
1459
+ {
1460
+ "epoch": 15.64516129032258,
1461
+ "eval_loss": 0.16068318486213684,
1462
+ "eval_runtime": 190.5766,
1463
+ "eval_samples_per_second": 99.718,
1464
+ "eval_steps_per_second": 2.078,
1465
+ "step": 4850
1466
+ },
1467
+ {
1468
+ "epoch": 15.806451612903226,
1469
+ "grad_norm": 0.504154622554779,
1470
+ "learning_rate": 0.00013678709677419353,
1471
+ "loss": 0.1678,
1472
+ "step": 4900
1473
+ },
1474
+ {
1475
+ "epoch": 15.806451612903226,
1476
+ "eval_loss": 0.15998931229114532,
1477
+ "eval_runtime": 185.5464,
1478
+ "eval_samples_per_second": 102.422,
1479
+ "eval_steps_per_second": 2.134,
1480
+ "step": 4900
1481
+ },
1482
+ {
1483
+ "epoch": 15.967741935483872,
1484
+ "grad_norm": 0.5468097925186157,
1485
+ "learning_rate": 0.00013614193548387097,
1486
+ "loss": 0.1792,
1487
+ "step": 4950
1488
+ },
1489
+ {
1490
+ "epoch": 15.967741935483872,
1491
+ "eval_loss": 0.15972544252872467,
1492
+ "eval_runtime": 186.5203,
1493
+ "eval_samples_per_second": 101.887,
1494
+ "eval_steps_per_second": 2.123,
1495
+ "step": 4950
1496
+ },
1497
+ {
1498
+ "epoch": 16.129032258064516,
1499
+ "grad_norm": 0.4495028853416443,
1500
+ "learning_rate": 0.0001354967741935484,
1501
+ "loss": 0.1755,
1502
+ "step": 5000
1503
+ },
1504
+ {
1505
+ "epoch": 16.129032258064516,
1506
+ "eval_loss": 0.1596982628107071,
1507
+ "eval_runtime": 194.6559,
1508
+ "eval_samples_per_second": 97.629,
1509
+ "eval_steps_per_second": 2.034,
1510
+ "step": 5000
1511
+ },
1512
+ {
1513
+ "epoch": 16.29032258064516,
1514
+ "grad_norm": 0.36081984639167786,
1515
+ "learning_rate": 0.0001348516129032258,
1516
+ "loss": 0.1659,
1517
+ "step": 5050
1518
+ },
1519
+ {
1520
+ "epoch": 16.29032258064516,
1521
+ "eval_loss": 0.15741322934627533,
1522
+ "eval_runtime": 193.3704,
1523
+ "eval_samples_per_second": 98.278,
1524
+ "eval_steps_per_second": 2.048,
1525
+ "step": 5050
1526
+ },
1527
+ {
1528
+ "epoch": 16.451612903225808,
1529
+ "grad_norm": 0.3472287654876709,
1530
+ "learning_rate": 0.00013420645161290324,
1531
+ "loss": 0.1672,
1532
+ "step": 5100
1533
+ },
1534
+ {
1535
+ "epoch": 16.451612903225808,
1536
+ "eval_loss": 0.16025249660015106,
1537
+ "eval_runtime": 177.054,
1538
+ "eval_samples_per_second": 107.334,
1539
+ "eval_steps_per_second": 2.237,
1540
+ "step": 5100
1541
+ },
1542
+ {
1543
+ "epoch": 16.612903225806452,
1544
+ "grad_norm": 0.4033275842666626,
1545
+ "learning_rate": 0.00013356129032258065,
1546
+ "loss": 0.1687,
1547
+ "step": 5150
1548
+ },
1549
+ {
1550
+ "epoch": 16.612903225806452,
1551
+ "eval_loss": 0.15610146522521973,
1552
+ "eval_runtime": 197.4165,
1553
+ "eval_samples_per_second": 96.263,
1554
+ "eval_steps_per_second": 2.006,
1555
+ "step": 5150
1556
+ },
1557
+ {
1558
+ "epoch": 16.774193548387096,
1559
+ "grad_norm": 0.43590107560157776,
1560
+ "learning_rate": 0.00013291612903225806,
1561
+ "loss": 0.1685,
1562
+ "step": 5200
1563
+ },
1564
+ {
1565
+ "epoch": 16.774193548387096,
1566
+ "eval_loss": 0.15723493695259094,
1567
+ "eval_runtime": 195.3122,
1568
+ "eval_samples_per_second": 97.301,
1569
+ "eval_steps_per_second": 2.028,
1570
+ "step": 5200
1571
+ },
1572
+ {
1573
+ "epoch": 16.93548387096774,
1574
+ "grad_norm": 0.33427444100379944,
1575
+ "learning_rate": 0.00013227096774193548,
1576
+ "loss": 0.1711,
1577
+ "step": 5250
1578
+ },
1579
+ {
1580
+ "epoch": 16.93548387096774,
1581
+ "eval_loss": 0.15542824566364288,
1582
+ "eval_runtime": 192.6293,
1583
+ "eval_samples_per_second": 98.656,
1584
+ "eval_steps_per_second": 2.056,
1585
+ "step": 5250
1586
+ },
1587
+ {
1588
+ "epoch": 17.096774193548388,
1589
+ "grad_norm": 0.38596487045288086,
1590
+ "learning_rate": 0.00013162580645161291,
1591
+ "loss": 0.1662,
1592
+ "step": 5300
1593
+ },
1594
+ {
1595
+ "epoch": 17.096774193548388,
1596
+ "eval_loss": 0.15477755665779114,
1597
+ "eval_runtime": 218.9351,
1598
+ "eval_samples_per_second": 86.802,
1599
+ "eval_steps_per_second": 1.809,
1600
+ "step": 5300
1601
+ },
1602
+ {
1603
+ "epoch": 17.258064516129032,
1604
+ "grad_norm": 0.30149680376052856,
1605
+ "learning_rate": 0.00013098064516129033,
1606
+ "loss": 0.1632,
1607
+ "step": 5350
1608
+ },
1609
+ {
1610
+ "epoch": 17.258064516129032,
1611
+ "eval_loss": 0.15298867225646973,
1612
+ "eval_runtime": 109.5329,
1613
+ "eval_samples_per_second": 173.5,
1614
+ "eval_steps_per_second": 3.615,
1615
+ "step": 5350
1616
+ },
1617
+ {
1618
+ "epoch": 17.419354838709676,
1619
+ "grad_norm": 0.34499385952949524,
1620
+ "learning_rate": 0.00013033548387096774,
1621
+ "loss": 0.1627,
1622
+ "step": 5400
1623
+ },
1624
+ {
1625
+ "epoch": 17.419354838709676,
1626
+ "eval_loss": 0.1536460816860199,
1627
+ "eval_runtime": 89.4248,
1628
+ "eval_samples_per_second": 212.514,
1629
+ "eval_steps_per_second": 4.428,
1630
+ "step": 5400
1631
+ },
1632
+ {
1633
+ "epoch": 17.580645161290324,
1634
+ "grad_norm": 0.4599091410636902,
1635
+ "learning_rate": 0.00012969032258064518,
1636
+ "loss": 0.1611,
1637
+ "step": 5450
1638
+ },
1639
+ {
1640
+ "epoch": 17.580645161290324,
1641
+ "eval_loss": 0.15192009508609772,
1642
+ "eval_runtime": 87.9147,
1643
+ "eval_samples_per_second": 216.164,
1644
+ "eval_steps_per_second": 4.504,
1645
+ "step": 5450
1646
+ },
1647
+ {
1648
+ "epoch": 17.741935483870968,
1649
+ "grad_norm": 0.3737597167491913,
1650
+ "learning_rate": 0.0001290451612903226,
1651
+ "loss": 0.1644,
1652
+ "step": 5500
1653
+ },
1654
+ {
1655
+ "epoch": 17.741935483870968,
1656
+ "eval_loss": 0.15356366336345673,
1657
+ "eval_runtime": 87.8138,
1658
+ "eval_samples_per_second": 216.413,
1659
+ "eval_steps_per_second": 4.51,
1660
+ "step": 5500
1661
+ },
1662
+ {
1663
+ "epoch": 17.903225806451612,
1664
+ "grad_norm": 0.49820005893707275,
1665
+ "learning_rate": 0.0001284,
1666
+ "loss": 0.1641,
1667
+ "step": 5550
1668
+ },
1669
+ {
1670
+ "epoch": 17.903225806451612,
1671
+ "eval_loss": 0.15130773186683655,
1672
+ "eval_runtime": 89.806,
1673
+ "eval_samples_per_second": 211.612,
1674
+ "eval_steps_per_second": 4.41,
1675
+ "step": 5550
1676
+ },
1677
+ {
1678
+ "epoch": 18.06451612903226,
1679
+ "grad_norm": 0.431963711977005,
1680
+ "learning_rate": 0.00012775483870967742,
1681
+ "loss": 0.1629,
1682
+ "step": 5600
1683
+ },
1684
+ {
1685
+ "epoch": 18.06451612903226,
1686
+ "eval_loss": 0.15356026589870453,
1687
+ "eval_runtime": 90.5983,
1688
+ "eval_samples_per_second": 209.761,
1689
+ "eval_steps_per_second": 4.371,
1690
+ "step": 5600
1691
+ },
1692
+ {
1693
+ "epoch": 18.225806451612904,
1694
+ "grad_norm": 0.41033461689949036,
1695
+ "learning_rate": 0.00012710967741935486,
1696
+ "loss": 0.1592,
1697
+ "step": 5650
1698
+ },
1699
+ {
1700
+ "epoch": 18.225806451612904,
1701
+ "eval_loss": 0.15416103601455688,
1702
+ "eval_runtime": 88.2514,
1703
+ "eval_samples_per_second": 215.339,
1704
+ "eval_steps_per_second": 4.487,
1705
+ "step": 5650
1706
+ },
1707
+ {
1708
+ "epoch": 18.387096774193548,
1709
+ "grad_norm": 0.360398530960083,
1710
+ "learning_rate": 0.00012646451612903227,
1711
+ "loss": 0.1595,
1712
+ "step": 5700
1713
+ },
1714
+ {
1715
+ "epoch": 18.387096774193548,
1716
+ "eval_loss": 0.1532827764749527,
1717
+ "eval_runtime": 89.7789,
1718
+ "eval_samples_per_second": 211.675,
1719
+ "eval_steps_per_second": 4.411,
1720
+ "step": 5700
1721
+ },
1722
+ {
1723
+ "epoch": 18.548387096774192,
1724
+ "grad_norm": 0.3443894386291504,
1725
+ "learning_rate": 0.00012581935483870968,
1726
+ "loss": 0.1627,
1727
+ "step": 5750
1728
+ },
1729
+ {
1730
+ "epoch": 18.548387096774192,
1731
+ "eval_loss": 0.15046313405036926,
1732
+ "eval_runtime": 88.4154,
1733
+ "eval_samples_per_second": 214.94,
1734
+ "eval_steps_per_second": 4.479,
1735
+ "step": 5750
1736
+ },
1737
+ {
1738
+ "epoch": 18.70967741935484,
1739
+ "grad_norm": 0.384339839220047,
1740
+ "learning_rate": 0.00012517419354838712,
1741
+ "loss": 0.1609,
1742
+ "step": 5800
1743
+ },
1744
+ {
1745
+ "epoch": 18.70967741935484,
1746
+ "eval_loss": 0.1508285254240036,
1747
+ "eval_runtime": 89.1892,
1748
+ "eval_samples_per_second": 213.075,
1749
+ "eval_steps_per_second": 4.44,
1750
+ "step": 5800
1751
+ },
1752
+ {
1753
+ "epoch": 18.870967741935484,
1754
+ "grad_norm": 0.4039391577243805,
1755
+ "learning_rate": 0.0001245290322580645,
1756
+ "loss": 0.1593,
1757
+ "step": 5850
1758
+ },
1759
+ {
1760
+ "epoch": 18.870967741935484,
1761
+ "eval_loss": 0.15185365080833435,
1762
+ "eval_runtime": 87.6173,
1763
+ "eval_samples_per_second": 216.898,
1764
+ "eval_steps_per_second": 4.52,
1765
+ "step": 5850
1766
+ },
1767
+ {
1768
+ "epoch": 19.032258064516128,
1769
+ "grad_norm": 0.36354830861091614,
1770
+ "learning_rate": 0.00012388387096774195,
1771
+ "loss": 0.1609,
1772
+ "step": 5900
1773
+ },
1774
+ {
1775
+ "epoch": 19.032258064516128,
1776
+ "eval_loss": 0.15022191405296326,
1777
+ "eval_runtime": 88.0513,
1778
+ "eval_samples_per_second": 215.829,
1779
+ "eval_steps_per_second": 4.497,
1780
+ "step": 5900
1781
+ },
1782
+ {
1783
+ "epoch": 19.193548387096776,
1784
+ "grad_norm": 0.41100090742111206,
1785
+ "learning_rate": 0.00012323870967741936,
1786
+ "loss": 0.1573,
1787
+ "step": 5950
1788
+ },
1789
+ {
1790
+ "epoch": 19.193548387096776,
1791
+ "eval_loss": 0.14913968741893768,
1792
+ "eval_runtime": 89.4022,
1793
+ "eval_samples_per_second": 212.568,
1794
+ "eval_steps_per_second": 4.429,
1795
+ "step": 5950
1796
+ },
1797
+ {
1798
+ "epoch": 19.35483870967742,
1799
+ "grad_norm": 0.3832205832004547,
1800
+ "learning_rate": 0.00012259354838709677,
1801
+ "loss": 0.1542,
1802
+ "step": 6000
1803
+ },
1804
+ {
1805
+ "epoch": 19.35483870967742,
1806
+ "eval_loss": 0.14829514920711517,
1807
+ "eval_runtime": 86.5784,
1808
+ "eval_samples_per_second": 219.5,
1809
+ "eval_steps_per_second": 4.574,
1810
+ "step": 6000
1811
+ },
1812
+ {
1813
+ "epoch": 19.516129032258064,
1814
+ "grad_norm": 0.3583919405937195,
1815
+ "learning_rate": 0.00012194838709677421,
1816
+ "loss": 0.1562,
1817
+ "step": 6050
1818
+ },
1819
+ {
1820
+ "epoch": 19.516129032258064,
1821
+ "eval_loss": 0.14888739585876465,
1822
+ "eval_runtime": 84.8338,
1823
+ "eval_samples_per_second": 224.014,
1824
+ "eval_steps_per_second": 4.668,
1825
+ "step": 6050
1826
+ },
1827
+ {
1828
+ "epoch": 19.677419354838708,
1829
+ "grad_norm": 0.3783506751060486,
1830
+ "learning_rate": 0.00012130322580645161,
1831
+ "loss": 0.1551,
1832
+ "step": 6100
1833
+ },
1834
+ {
1835
+ "epoch": 19.677419354838708,
1836
+ "eval_loss": 0.1506240963935852,
1837
+ "eval_runtime": 87.3622,
1838
+ "eval_samples_per_second": 217.531,
1839
+ "eval_steps_per_second": 4.533,
1840
+ "step": 6100
1841
+ },
1842
+ {
1843
+ "epoch": 19.838709677419356,
1844
+ "grad_norm": 0.39638015627861023,
1845
+ "learning_rate": 0.00012065806451612905,
1846
+ "loss": 0.1544,
1847
+ "step": 6150
1848
+ },
1849
+ {
1850
+ "epoch": 19.838709677419356,
1851
+ "eval_loss": 0.1477993279695511,
1852
+ "eval_runtime": 90.2688,
1853
+ "eval_samples_per_second": 210.527,
1854
+ "eval_steps_per_second": 4.387,
1855
+ "step": 6150
1856
+ },
1857
+ {
1858
+ "epoch": 20.0,
1859
+ "grad_norm": 0.5352652072906494,
1860
+ "learning_rate": 0.00012001290322580645,
1861
+ "loss": 0.1558,
1862
+ "step": 6200
1863
+ },
1864
+ {
1865
+ "epoch": 20.0,
1866
+ "eval_loss": 0.1461372971534729,
1867
+ "eval_runtime": 88.0458,
1868
+ "eval_samples_per_second": 215.842,
1869
+ "eval_steps_per_second": 4.498,
1870
+ "step": 6200
1871
+ },
1872
+ {
1873
+ "epoch": 20.161290322580644,
1874
+ "grad_norm": 0.32452672719955444,
1875
+ "learning_rate": 0.00011936774193548387,
1876
+ "loss": 0.15,
1877
+ "step": 6250
1878
+ },
1879
+ {
1880
+ "epoch": 20.161290322580644,
1881
+ "eval_loss": 0.14649870991706848,
1882
+ "eval_runtime": 86.0304,
1883
+ "eval_samples_per_second": 220.899,
1884
+ "eval_steps_per_second": 4.603,
1885
+ "step": 6250
1886
+ },
1887
+ {
1888
+ "epoch": 20.322580645161292,
1889
+ "grad_norm": 0.3544420599937439,
1890
+ "learning_rate": 0.00011872258064516129,
1891
+ "loss": 0.1451,
1892
+ "step": 6300
1893
+ },
1894
+ {
1895
+ "epoch": 20.322580645161292,
1896
+ "eval_loss": 0.14727556705474854,
1897
+ "eval_runtime": 89.6974,
1898
+ "eval_samples_per_second": 211.868,
1899
+ "eval_steps_per_second": 4.415,
1900
+ "step": 6300
1901
+ },
1902
+ {
1903
+ "epoch": 20.483870967741936,
1904
+ "grad_norm": 0.3795066773891449,
1905
+ "learning_rate": 0.00011807741935483871,
1906
+ "loss": 0.1544,
1907
+ "step": 6350
1908
+ },
1909
+ {
1910
+ "epoch": 20.483870967741936,
1911
+ "eval_loss": 0.1442754566669464,
1912
+ "eval_runtime": 89.414,
1913
+ "eval_samples_per_second": 212.54,
1914
+ "eval_steps_per_second": 4.429,
1915
+ "step": 6350
1916
+ },
1917
+ {
1918
+ "epoch": 20.64516129032258,
1919
+ "grad_norm": 0.4115369915962219,
1920
+ "learning_rate": 0.00011743225806451614,
1921
+ "loss": 0.1507,
1922
+ "step": 6400
1923
+ },
1924
+ {
1925
+ "epoch": 20.64516129032258,
1926
+ "eval_loss": 0.14425314962863922,
1927
+ "eval_runtime": 88.6568,
1928
+ "eval_samples_per_second": 214.355,
1929
+ "eval_steps_per_second": 4.467,
1930
+ "step": 6400
1931
+ },
1932
+ {
1933
+ "epoch": 20.806451612903224,
1934
+ "grad_norm": 0.38675764203071594,
1935
+ "learning_rate": 0.00011678709677419355,
1936
+ "loss": 0.1497,
1937
+ "step": 6450
1938
+ },
1939
+ {
1940
+ "epoch": 20.806451612903224,
1941
+ "eval_loss": 0.1459958851337433,
1942
+ "eval_runtime": 87.6832,
1943
+ "eval_samples_per_second": 216.735,
1944
+ "eval_steps_per_second": 4.516,
1945
+ "step": 6450
1946
+ },
1947
+ {
1948
+ "epoch": 20.967741935483872,
1949
+ "grad_norm": 0.37480926513671875,
1950
+ "learning_rate": 0.00011614193548387098,
1951
+ "loss": 0.1509,
1952
+ "step": 6500
1953
+ },
1954
+ {
1955
+ "epoch": 20.967741935483872,
1956
+ "eval_loss": 0.14522159099578857,
1957
+ "eval_runtime": 86.8578,
1958
+ "eval_samples_per_second": 218.794,
1959
+ "eval_steps_per_second": 4.559,
1960
+ "step": 6500
1961
+ },
1962
+ {
1963
+ "epoch": 21.129032258064516,
1964
+ "grad_norm": 0.3740350306034088,
1965
+ "learning_rate": 0.00011549677419354839,
1966
+ "loss": 0.1504,
1967
+ "step": 6550
1968
+ },
1969
+ {
1970
+ "epoch": 21.129032258064516,
1971
+ "eval_loss": 0.14784836769104004,
1972
+ "eval_runtime": 85.4911,
1973
+ "eval_samples_per_second": 222.292,
1974
+ "eval_steps_per_second": 4.632,
1975
+ "step": 6550
1976
+ },
1977
+ {
1978
+ "epoch": 21.29032258064516,
1979
+ "grad_norm": 0.4533497095108032,
1980
+ "learning_rate": 0.00011485161290322581,
1981
+ "loss": 0.1517,
1982
+ "step": 6600
1983
+ },
1984
+ {
1985
+ "epoch": 21.29032258064516,
1986
+ "eval_loss": 0.14581461250782013,
1987
+ "eval_runtime": 86.0863,
1988
+ "eval_samples_per_second": 220.755,
1989
+ "eval_steps_per_second": 4.6,
1990
+ "step": 6600
1991
+ },
1992
+ {
1993
+ "epoch": 21.451612903225808,
1994
+ "grad_norm": 0.3758571743965149,
1995
+ "learning_rate": 0.00011420645161290323,
1996
+ "loss": 0.1452,
1997
+ "step": 6650
1998
+ },
1999
+ {
2000
+ "epoch": 21.451612903225808,
2001
+ "eval_loss": 0.1412593424320221,
2002
+ "eval_runtime": 86.2595,
2003
+ "eval_samples_per_second": 220.312,
2004
+ "eval_steps_per_second": 4.591,
2005
+ "step": 6650
2006
+ },
2007
+ {
2008
+ "epoch": 21.612903225806452,
2009
+ "grad_norm": 0.3700609803199768,
2010
+ "learning_rate": 0.00011356129032258065,
2011
+ "loss": 0.1461,
2012
+ "step": 6700
2013
+ },
2014
+ {
2015
+ "epoch": 21.612903225806452,
2016
+ "eval_loss": 0.1432737410068512,
2017
+ "eval_runtime": 86.1269,
2018
+ "eval_samples_per_second": 220.651,
2019
+ "eval_steps_per_second": 4.598,
2020
+ "step": 6700
2021
+ },
2022
+ {
2023
+ "epoch": 21.774193548387096,
2024
+ "grad_norm": 0.31164905428886414,
2025
+ "learning_rate": 0.00011291612903225808,
2026
+ "loss": 0.1463,
2027
+ "step": 6750
2028
+ },
2029
+ {
2030
+ "epoch": 21.774193548387096,
2031
+ "eval_loss": 0.14101718366146088,
2032
+ "eval_runtime": 88.7161,
2033
+ "eval_samples_per_second": 214.211,
2034
+ "eval_steps_per_second": 4.464,
2035
+ "step": 6750
2036
+ },
2037
+ {
2038
+ "epoch": 21.93548387096774,
2039
+ "grad_norm": 0.3831172287464142,
2040
+ "learning_rate": 0.00011227096774193549,
2041
+ "loss": 0.1509,
2042
+ "step": 6800
2043
+ },
2044
+ {
2045
+ "epoch": 21.93548387096774,
2046
+ "eval_loss": 0.14286787807941437,
2047
+ "eval_runtime": 88.4285,
2048
+ "eval_samples_per_second": 214.908,
2049
+ "eval_steps_per_second": 4.478,
2050
+ "step": 6800
2051
+ },
2052
+ {
2053
+ "epoch": 22.096774193548388,
2054
+ "grad_norm": 0.3675175905227661,
2055
+ "learning_rate": 0.00011162580645161292,
2056
+ "loss": 0.1478,
2057
+ "step": 6850
2058
+ },
2059
+ {
2060
+ "epoch": 22.096774193548388,
2061
+ "eval_loss": 0.14311262965202332,
2062
+ "eval_runtime": 86.1923,
2063
+ "eval_samples_per_second": 220.484,
2064
+ "eval_steps_per_second": 4.594,
2065
+ "step": 6850
2066
+ },
2067
+ {
2068
+ "epoch": 22.258064516129032,
2069
+ "grad_norm": 0.4077725410461426,
2070
+ "learning_rate": 0.00011098064516129032,
2071
+ "loss": 0.1455,
2072
+ "step": 6900
2073
+ },
2074
+ {
2075
+ "epoch": 22.258064516129032,
2076
+ "eval_loss": 0.1406867653131485,
2077
+ "eval_runtime": 88.1595,
2078
+ "eval_samples_per_second": 215.564,
2079
+ "eval_steps_per_second": 4.492,
2080
+ "step": 6900
2081
+ },
2082
+ {
2083
+ "epoch": 22.419354838709676,
2084
+ "grad_norm": 0.37918218970298767,
2085
+ "learning_rate": 0.00011033548387096775,
2086
+ "loss": 0.1438,
2087
+ "step": 6950
2088
+ },
2089
+ {
2090
+ "epoch": 22.419354838709676,
2091
+ "eval_loss": 0.14213036000728607,
2092
+ "eval_runtime": 88.6161,
2093
+ "eval_samples_per_second": 214.453,
2094
+ "eval_steps_per_second": 4.469,
2095
+ "step": 6950
2096
+ },
2097
+ {
2098
+ "epoch": 22.580645161290324,
2099
+ "grad_norm": 0.4112975597381592,
2100
+ "learning_rate": 0.00010969032258064518,
2101
+ "loss": 0.1471,
2102
+ "step": 7000
2103
+ },
2104
+ {
2105
+ "epoch": 22.580645161290324,
2106
+ "eval_loss": 0.14302890002727509,
2107
+ "eval_runtime": 87.2404,
2108
+ "eval_samples_per_second": 217.835,
2109
+ "eval_steps_per_second": 4.539,
2110
+ "step": 7000
2111
+ },
2112
+ {
2113
+ "epoch": 22.741935483870968,
2114
+ "grad_norm": 0.3555707335472107,
2115
+ "learning_rate": 0.00010904516129032258,
2116
+ "loss": 0.1435,
2117
+ "step": 7050
2118
+ },
2119
+ {
2120
+ "epoch": 22.741935483870968,
2121
+ "eval_loss": 0.14145122468471527,
2122
+ "eval_runtime": 87.7191,
2123
+ "eval_samples_per_second": 216.646,
2124
+ "eval_steps_per_second": 4.514,
2125
+ "step": 7050
2126
+ },
2127
+ {
2128
+ "epoch": 22.903225806451612,
2129
+ "grad_norm": 0.33775362372398376,
2130
+ "learning_rate": 0.00010840000000000002,
2131
+ "loss": 0.1459,
2132
+ "step": 7100
2133
+ },
2134
+ {
2135
+ "epoch": 22.903225806451612,
2136
+ "eval_loss": 0.14196287095546722,
2137
+ "eval_runtime": 85.2736,
2138
+ "eval_samples_per_second": 222.859,
2139
+ "eval_steps_per_second": 4.644,
2140
+ "step": 7100
2141
+ },
2142
+ {
2143
+ "epoch": 23.06451612903226,
2144
+ "grad_norm": 0.40644168853759766,
2145
+ "learning_rate": 0.00010775483870967742,
2146
+ "loss": 0.1403,
2147
+ "step": 7150
2148
+ },
2149
+ {
2150
+ "epoch": 23.06451612903226,
2151
+ "eval_loss": 0.13985274732112885,
2152
+ "eval_runtime": 88.2679,
2153
+ "eval_samples_per_second": 215.299,
2154
+ "eval_steps_per_second": 4.486,
2155
+ "step": 7150
2156
+ },
2157
+ {
2158
+ "epoch": 23.225806451612904,
2159
+ "grad_norm": 0.30164963006973267,
2160
+ "learning_rate": 0.00010710967741935484,
2161
+ "loss": 0.1438,
2162
+ "step": 7200
2163
+ },
2164
+ {
2165
+ "epoch": 23.225806451612904,
2166
+ "eval_loss": 0.13959668576717377,
2167
+ "eval_runtime": 85.8815,
2168
+ "eval_samples_per_second": 221.282,
2169
+ "eval_steps_per_second": 4.611,
2170
+ "step": 7200
2171
+ },
2172
+ {
2173
+ "epoch": 23.387096774193548,
2174
+ "grad_norm": 0.41760918498039246,
2175
+ "learning_rate": 0.00010646451612903226,
2176
+ "loss": 0.1455,
2177
+ "step": 7250
2178
+ },
2179
+ {
2180
+ "epoch": 23.387096774193548,
2181
+ "eval_loss": 0.1405312865972519,
2182
+ "eval_runtime": 87.4804,
2183
+ "eval_samples_per_second": 217.237,
2184
+ "eval_steps_per_second": 4.527,
2185
+ "step": 7250
2186
+ },
2187
+ {
2188
+ "epoch": 23.548387096774192,
2189
+ "grad_norm": 0.31449875235557556,
2190
+ "learning_rate": 0.00010581935483870968,
2191
+ "loss": 0.1416,
2192
+ "step": 7300
2193
+ },
2194
+ {
2195
+ "epoch": 23.548387096774192,
2196
+ "eval_loss": 0.1397952139377594,
2197
+ "eval_runtime": 84.3693,
2198
+ "eval_samples_per_second": 225.248,
2199
+ "eval_steps_per_second": 4.694,
2200
+ "step": 7300
2201
+ },
2202
+ {
2203
+ "epoch": 23.70967741935484,
2204
+ "grad_norm": 0.34104588627815247,
2205
+ "learning_rate": 0.00010517419354838711,
2206
+ "loss": 0.143,
2207
+ "step": 7350
2208
+ },
2209
+ {
2210
+ "epoch": 23.70967741935484,
2211
+ "eval_loss": 0.13995403051376343,
2212
+ "eval_runtime": 88.9533,
2213
+ "eval_samples_per_second": 213.64,
2214
+ "eval_steps_per_second": 4.452,
2215
+ "step": 7350
2216
+ },
2217
+ {
2218
+ "epoch": 23.870967741935484,
2219
+ "grad_norm": 0.43316343426704407,
2220
+ "learning_rate": 0.00010452903225806452,
2221
+ "loss": 0.1432,
2222
+ "step": 7400
2223
+ },
2224
+ {
2225
+ "epoch": 23.870967741935484,
2226
+ "eval_loss": 0.13980048894882202,
2227
+ "eval_runtime": 87.8388,
2228
+ "eval_samples_per_second": 216.351,
2229
+ "eval_steps_per_second": 4.508,
2230
+ "step": 7400
2231
+ },
2232
+ {
2233
+ "epoch": 24.032258064516128,
2234
+ "grad_norm": 0.35213446617126465,
2235
+ "learning_rate": 0.00010388387096774195,
2236
+ "loss": 0.1463,
2237
+ "step": 7450
2238
+ },
2239
+ {
2240
+ "epoch": 24.032258064516128,
2241
+ "eval_loss": 0.1396203339099884,
2242
+ "eval_runtime": 86.1081,
2243
+ "eval_samples_per_second": 220.699,
2244
+ "eval_steps_per_second": 4.599,
2245
+ "step": 7450
2246
+ },
2247
+ {
2248
+ "epoch": 24.193548387096776,
2249
+ "grad_norm": 0.2780129313468933,
2250
+ "learning_rate": 0.00010323870967741936,
2251
+ "loss": 0.1396,
2252
+ "step": 7500
2253
+ },
2254
+ {
2255
+ "epoch": 24.193548387096776,
2256
+ "eval_loss": 0.13865940272808075,
2257
+ "eval_runtime": 84.8365,
2258
+ "eval_samples_per_second": 224.007,
2259
+ "eval_steps_per_second": 4.668,
2260
+ "step": 7500
2261
+ },
2262
+ {
2263
+ "epoch": 24.35483870967742,
2264
+ "grad_norm": 0.34334343671798706,
2265
+ "learning_rate": 0.00010259354838709679,
2266
+ "loss": 0.1395,
2267
+ "step": 7550
2268
+ },
2269
+ {
2270
+ "epoch": 24.35483870967742,
2271
+ "eval_loss": 0.1386643797159195,
2272
+ "eval_runtime": 85.7437,
2273
+ "eval_samples_per_second": 221.637,
2274
+ "eval_steps_per_second": 4.618,
2275
+ "step": 7550
2276
+ },
2277
+ {
2278
+ "epoch": 24.516129032258064,
2279
+ "grad_norm": 0.3119650185108185,
2280
+ "learning_rate": 0.00010194838709677418,
2281
+ "loss": 0.1381,
2282
+ "step": 7600
2283
+ },
2284
+ {
2285
+ "epoch": 24.516129032258064,
2286
+ "eval_loss": 0.1378747671842575,
2287
+ "eval_runtime": 94.9067,
2288
+ "eval_samples_per_second": 200.239,
2289
+ "eval_steps_per_second": 4.173,
2290
+ "step": 7600
2291
+ },
2292
+ {
2293
+ "epoch": 24.677419354838708,
2294
+ "grad_norm": 0.36497557163238525,
2295
+ "learning_rate": 0.00010130322580645162,
2296
+ "loss": 0.1429,
2297
+ "step": 7650
2298
+ },
2299
+ {
2300
+ "epoch": 24.677419354838708,
2301
+ "eval_loss": 0.1373891532421112,
2302
+ "eval_runtime": 86.0356,
2303
+ "eval_samples_per_second": 220.885,
2304
+ "eval_steps_per_second": 4.603,
2305
+ "step": 7650
2306
+ },
2307
+ {
2308
+ "epoch": 24.838709677419356,
2309
+ "grad_norm": 0.3456083834171295,
2310
+ "learning_rate": 0.00010065806451612905,
2311
+ "loss": 0.1349,
2312
+ "step": 7700
2313
+ },
2314
+ {
2315
+ "epoch": 24.838709677419356,
2316
+ "eval_loss": 0.1388697326183319,
2317
+ "eval_runtime": 85.5136,
2318
+ "eval_samples_per_second": 222.234,
2319
+ "eval_steps_per_second": 4.631,
2320
+ "step": 7700
2321
+ },
2322
+ {
2323
+ "epoch": 25.0,
2324
+ "grad_norm": 0.4443909227848053,
2325
+ "learning_rate": 0.00010001290322580645,
2326
+ "loss": 0.1403,
2327
+ "step": 7750
2328
+ },
2329
+ {
2330
+ "epoch": 25.0,
2331
+ "eval_loss": 0.1377411037683487,
2332
+ "eval_runtime": 91.0633,
2333
+ "eval_samples_per_second": 208.69,
2334
+ "eval_steps_per_second": 4.349,
2335
+ "step": 7750
2336
+ },
2337
+ {
2338
+ "epoch": 25.161290322580644,
2339
+ "grad_norm": 0.34777510166168213,
2340
+ "learning_rate": 9.936774193548387e-05,
2341
+ "loss": 0.1383,
2342
+ "step": 7800
2343
+ },
2344
+ {
2345
+ "epoch": 25.161290322580644,
2346
+ "eval_loss": 0.13685546815395355,
2347
+ "eval_runtime": 88.0711,
2348
+ "eval_samples_per_second": 215.78,
2349
+ "eval_steps_per_second": 4.496,
2350
+ "step": 7800
2351
+ },
2352
+ {
2353
+ "epoch": 25.322580645161292,
2354
+ "grad_norm": 0.35419756174087524,
2355
+ "learning_rate": 9.87225806451613e-05,
2356
+ "loss": 0.1348,
2357
+ "step": 7850
2358
+ },
2359
+ {
2360
+ "epoch": 25.322580645161292,
2361
+ "eval_loss": 0.13754527270793915,
2362
+ "eval_runtime": 85.9879,
2363
+ "eval_samples_per_second": 221.008,
2364
+ "eval_steps_per_second": 4.605,
2365
+ "step": 7850
2366
+ },
2367
+ {
2368
+ "epoch": 25.483870967741936,
2369
+ "grad_norm": 0.35281285643577576,
2370
+ "learning_rate": 9.807741935483871e-05,
2371
+ "loss": 0.136,
2372
+ "step": 7900
2373
+ },
2374
+ {
2375
+ "epoch": 25.483870967741936,
2376
+ "eval_loss": 0.13788650929927826,
2377
+ "eval_runtime": 86.623,
2378
+ "eval_samples_per_second": 219.387,
2379
+ "eval_steps_per_second": 4.572,
2380
+ "step": 7900
2381
+ },
2382
+ {
2383
+ "epoch": 25.64516129032258,
2384
+ "grad_norm": 0.26881253719329834,
2385
+ "learning_rate": 9.743225806451614e-05,
2386
+ "loss": 0.1376,
2387
+ "step": 7950
2388
+ },
2389
+ {
2390
+ "epoch": 25.64516129032258,
2391
+ "eval_loss": 0.13465990126132965,
2392
+ "eval_runtime": 87.1032,
2393
+ "eval_samples_per_second": 218.178,
2394
+ "eval_steps_per_second": 4.546,
2395
+ "step": 7950
2396
+ },
2397
+ {
2398
+ "epoch": 25.806451612903224,
2399
+ "grad_norm": 0.38799649477005005,
2400
+ "learning_rate": 9.678709677419355e-05,
2401
+ "loss": 0.1365,
2402
+ "step": 8000
2403
+ },
2404
+ {
2405
+ "epoch": 25.806451612903224,
2406
+ "eval_loss": 0.13522003591060638,
2407
+ "eval_runtime": 95.2991,
2408
+ "eval_samples_per_second": 199.414,
2409
+ "eval_steps_per_second": 4.155,
2410
+ "step": 8000
2411
+ },
2412
+ {
2413
+ "epoch": 25.967741935483872,
2414
+ "grad_norm": 0.37531042098999023,
2415
+ "learning_rate": 9.614193548387098e-05,
2416
+ "loss": 0.1362,
2417
+ "step": 8050
2418
+ },
2419
+ {
2420
+ "epoch": 25.967741935483872,
2421
+ "eval_loss": 0.13398829102516174,
2422
+ "eval_runtime": 87.8409,
2423
+ "eval_samples_per_second": 216.346,
2424
+ "eval_steps_per_second": 4.508,
2425
+ "step": 8050
2426
+ },
2427
+ {
2428
+ "epoch": 26.129032258064516,
2429
+ "grad_norm": 0.3436211049556732,
2430
+ "learning_rate": 9.549677419354839e-05,
2431
+ "loss": 0.1342,
2432
+ "step": 8100
2433
+ },
2434
+ {
2435
+ "epoch": 26.129032258064516,
2436
+ "eval_loss": 0.13594096899032593,
2437
+ "eval_runtime": 84.6121,
2438
+ "eval_samples_per_second": 224.602,
2439
+ "eval_steps_per_second": 4.68,
2440
+ "step": 8100
2441
+ },
2442
+ {
2443
+ "epoch": 26.29032258064516,
2444
+ "grad_norm": 0.38407161831855774,
2445
+ "learning_rate": 9.48516129032258e-05,
2446
+ "loss": 0.1322,
2447
+ "step": 8150
2448
+ },
2449
+ {
2450
+ "epoch": 26.29032258064516,
2451
+ "eval_loss": 0.13434267044067383,
2452
+ "eval_runtime": 86.2264,
2453
+ "eval_samples_per_second": 220.396,
2454
+ "eval_steps_per_second": 4.593,
2455
+ "step": 8150
2456
+ },
2457
+ {
2458
+ "epoch": 26.451612903225808,
2459
+ "grad_norm": 0.3329039514064789,
2460
+ "learning_rate": 9.420645161290324e-05,
2461
+ "loss": 0.1338,
2462
+ "step": 8200
2463
+ },
2464
+ {
2465
+ "epoch": 26.451612903225808,
2466
+ "eval_loss": 0.1350966840982437,
2467
+ "eval_runtime": 87.8388,
2468
+ "eval_samples_per_second": 216.351,
2469
+ "eval_steps_per_second": 4.508,
2470
+ "step": 8200
2471
+ },
2472
+ {
2473
+ "epoch": 26.612903225806452,
2474
+ "grad_norm": 0.41340529918670654,
2475
+ "learning_rate": 9.356129032258065e-05,
2476
+ "loss": 0.1362,
2477
+ "step": 8250
2478
+ },
2479
+ {
2480
+ "epoch": 26.612903225806452,
2481
+ "eval_loss": 0.13463687896728516,
2482
+ "eval_runtime": 86.5564,
2483
+ "eval_samples_per_second": 219.556,
2484
+ "eval_steps_per_second": 4.575,
2485
+ "step": 8250
2486
+ },
2487
+ {
2488
+ "epoch": 26.774193548387096,
2489
+ "grad_norm": 0.33343157172203064,
2490
+ "learning_rate": 9.291612903225807e-05,
2491
+ "loss": 0.1331,
2492
+ "step": 8300
2493
+ },
2494
+ {
2495
+ "epoch": 26.774193548387096,
2496
+ "eval_loss": 0.1351860612630844,
2497
+ "eval_runtime": 88.4351,
2498
+ "eval_samples_per_second": 214.892,
2499
+ "eval_steps_per_second": 4.478,
2500
+ "step": 8300
2501
+ },
2502
+ {
2503
+ "epoch": 26.93548387096774,
2504
+ "grad_norm": 0.3529933989048004,
2505
+ "learning_rate": 9.227096774193549e-05,
2506
+ "loss": 0.1306,
2507
+ "step": 8350
2508
+ },
2509
+ {
2510
+ "epoch": 26.93548387096774,
2511
+ "eval_loss": 0.13507899641990662,
2512
+ "eval_runtime": 88.8438,
2513
+ "eval_samples_per_second": 213.904,
2514
+ "eval_steps_per_second": 4.457,
2515
+ "step": 8350
2516
+ },
2517
+ {
2518
+ "epoch": 27.096774193548388,
2519
+ "grad_norm": 0.3433696925640106,
2520
+ "learning_rate": 9.16258064516129e-05,
2521
+ "loss": 0.1339,
2522
+ "step": 8400
2523
+ },
2524
+ {
2525
+ "epoch": 27.096774193548388,
2526
+ "eval_loss": 0.13309802114963531,
2527
+ "eval_runtime": 89.0663,
2528
+ "eval_samples_per_second": 213.369,
2529
+ "eval_steps_per_second": 4.446,
2530
+ "step": 8400
2531
+ },
2532
+ {
2533
+ "epoch": 27.258064516129032,
2534
+ "grad_norm": 0.3371010720729828,
2535
+ "learning_rate": 9.098064516129032e-05,
2536
+ "loss": 0.1315,
2537
+ "step": 8450
2538
+ },
2539
+ {
2540
+ "epoch": 27.258064516129032,
2541
+ "eval_loss": 0.1346307247877121,
2542
+ "eval_runtime": 88.7761,
2543
+ "eval_samples_per_second": 214.067,
2544
+ "eval_steps_per_second": 4.461,
2545
+ "step": 8450
2546
+ },
2547
+ {
2548
+ "epoch": 27.419354838709676,
2549
+ "grad_norm": 0.36965006589889526,
2550
+ "learning_rate": 9.033548387096774e-05,
2551
+ "loss": 0.1349,
2552
+ "step": 8500
2553
+ },
2554
+ {
2555
+ "epoch": 27.419354838709676,
2556
+ "eval_loss": 0.13345304131507874,
2557
+ "eval_runtime": 89.2673,
2558
+ "eval_samples_per_second": 212.889,
2559
+ "eval_steps_per_second": 4.436,
2560
+ "step": 8500
2561
+ },
2562
+ {
2563
+ "epoch": 27.580645161290324,
2564
+ "grad_norm": 0.3361060917377472,
2565
+ "learning_rate": 8.969032258064517e-05,
2566
+ "loss": 0.1323,
2567
+ "step": 8550
2568
+ },
2569
+ {
2570
+ "epoch": 27.580645161290324,
2571
+ "eval_loss": 0.1327604055404663,
2572
+ "eval_runtime": 87.1092,
2573
+ "eval_samples_per_second": 218.163,
2574
+ "eval_steps_per_second": 4.546,
2575
+ "step": 8550
2576
+ },
2577
+ {
2578
+ "epoch": 27.741935483870968,
2579
+ "grad_norm": 0.2936784029006958,
2580
+ "learning_rate": 8.904516129032258e-05,
2581
+ "loss": 0.132,
2582
+ "step": 8600
2583
+ },
2584
+ {
2585
+ "epoch": 27.741935483870968,
2586
+ "eval_loss": 0.1343098133802414,
2587
+ "eval_runtime": 89.5037,
2588
+ "eval_samples_per_second": 212.326,
2589
+ "eval_steps_per_second": 4.424,
2590
+ "step": 8600
2591
+ },
2592
+ {
2593
+ "epoch": 27.903225806451612,
2594
+ "grad_norm": 0.332289457321167,
2595
+ "learning_rate": 8.840000000000001e-05,
2596
+ "loss": 0.1303,
2597
+ "step": 8650
2598
+ },
2599
+ {
2600
+ "epoch": 27.903225806451612,
2601
+ "eval_loss": 0.1309853345155716,
2602
+ "eval_runtime": 88.6535,
2603
+ "eval_samples_per_second": 214.363,
2604
+ "eval_steps_per_second": 4.467,
2605
+ "step": 8650
2606
+ },
2607
+ {
2608
+ "epoch": 28.06451612903226,
2609
+ "grad_norm": 0.3243560791015625,
2610
+ "learning_rate": 8.775483870967742e-05,
2611
+ "loss": 0.1295,
2612
+ "step": 8700
2613
+ },
2614
+ {
2615
+ "epoch": 28.06451612903226,
2616
+ "eval_loss": 0.13385291397571564,
2617
+ "eval_runtime": 89.1997,
2618
+ "eval_samples_per_second": 213.05,
2619
+ "eval_steps_per_second": 4.439,
2620
+ "step": 8700
2621
+ },
2622
+ {
2623
+ "epoch": 28.225806451612904,
2624
+ "grad_norm": 0.2707726061344147,
2625
+ "learning_rate": 8.710967741935485e-05,
2626
+ "loss": 0.1296,
2627
+ "step": 8750
2628
+ },
2629
+ {
2630
+ "epoch": 28.225806451612904,
2631
+ "eval_loss": 0.131495863199234,
2632
+ "eval_runtime": 87.7498,
2633
+ "eval_samples_per_second": 216.57,
2634
+ "eval_steps_per_second": 4.513,
2635
+ "step": 8750
2636
+ },
2637
+ {
2638
+ "epoch": 28.387096774193548,
2639
+ "grad_norm": 0.3294861912727356,
2640
+ "learning_rate": 8.646451612903226e-05,
2641
+ "loss": 0.1309,
2642
+ "step": 8800
2643
+ },
2644
+ {
2645
+ "epoch": 28.387096774193548,
2646
+ "eval_loss": 0.1315741389989853,
2647
+ "eval_runtime": 89.3762,
2648
+ "eval_samples_per_second": 212.629,
2649
+ "eval_steps_per_second": 4.431,
2650
+ "step": 8800
2651
+ },
2652
+ {
2653
+ "epoch": 28.548387096774192,
2654
+ "grad_norm": 0.3417121469974518,
2655
+ "learning_rate": 8.581935483870968e-05,
2656
+ "loss": 0.1259,
2657
+ "step": 8850
2658
+ },
2659
+ {
2660
+ "epoch": 28.548387096774192,
2661
+ "eval_loss": 0.13167841732501984,
2662
+ "eval_runtime": 89.5004,
2663
+ "eval_samples_per_second": 212.334,
2664
+ "eval_steps_per_second": 4.425,
2665
+ "step": 8850
2666
+ },
2667
+ {
2668
+ "epoch": 28.70967741935484,
2669
+ "grad_norm": 0.4122408628463745,
2670
+ "learning_rate": 8.517419354838711e-05,
2671
+ "loss": 0.1309,
2672
+ "step": 8900
2673
+ },
2674
+ {
2675
+ "epoch": 28.70967741935484,
2676
+ "eval_loss": 0.1316230446100235,
2677
+ "eval_runtime": 87.7329,
2678
+ "eval_samples_per_second": 216.612,
2679
+ "eval_steps_per_second": 4.514,
2680
+ "step": 8900
2681
+ },
2682
+ {
2683
+ "epoch": 28.870967741935484,
2684
+ "grad_norm": 0.28204530477523804,
2685
+ "learning_rate": 8.452903225806452e-05,
2686
+ "loss": 0.1275,
2687
+ "step": 8950
2688
+ },
2689
+ {
2690
+ "epoch": 28.870967741935484,
2691
+ "eval_loss": 0.13107319176197052,
2692
+ "eval_runtime": 89.6188,
2693
+ "eval_samples_per_second": 212.054,
2694
+ "eval_steps_per_second": 4.419,
2695
+ "step": 8950
2696
+ },
2697
+ {
2698
+ "epoch": 29.032258064516128,
2699
+ "grad_norm": 0.35629284381866455,
2700
+ "learning_rate": 8.388387096774194e-05,
2701
+ "loss": 0.1288,
2702
+ "step": 9000
2703
+ },
2704
+ {
2705
+ "epoch": 29.032258064516128,
2706
+ "eval_loss": 0.13227057456970215,
2707
+ "eval_runtime": 88.8381,
2708
+ "eval_samples_per_second": 213.917,
2709
+ "eval_steps_per_second": 4.458,
2710
+ "step": 9000
2711
+ },
2712
+ {
2713
+ "epoch": 29.193548387096776,
2714
+ "grad_norm": 0.309741348028183,
2715
+ "learning_rate": 8.323870967741936e-05,
2716
+ "loss": 0.128,
2717
+ "step": 9050
2718
+ },
2719
+ {
2720
+ "epoch": 29.193548387096776,
2721
+ "eval_loss": 0.13167747855186462,
2722
+ "eval_runtime": 88.4882,
2723
+ "eval_samples_per_second": 214.763,
2724
+ "eval_steps_per_second": 4.475,
2725
+ "step": 9050
2726
+ },
2727
+ {
2728
+ "epoch": 29.35483870967742,
2729
+ "grad_norm": 0.288798987865448,
2730
+ "learning_rate": 8.259354838709677e-05,
2731
+ "loss": 0.1267,
2732
+ "step": 9100
2733
+ },
2734
+ {
2735
+ "epoch": 29.35483870967742,
2736
+ "eval_loss": 0.1299341917037964,
2737
+ "eval_runtime": 86.532,
2738
+ "eval_samples_per_second": 219.618,
2739
+ "eval_steps_per_second": 4.576,
2740
+ "step": 9100
2741
+ },
2742
+ {
2743
+ "epoch": 29.516129032258064,
2744
+ "grad_norm": 0.3058416545391083,
2745
+ "learning_rate": 8.19483870967742e-05,
2746
+ "loss": 0.1262,
2747
+ "step": 9150
2748
+ },
2749
+ {
2750
+ "epoch": 29.516129032258064,
2751
+ "eval_loss": 0.1303175687789917,
2752
+ "eval_runtime": 87.4,
2753
+ "eval_samples_per_second": 217.437,
2754
+ "eval_steps_per_second": 4.531,
2755
+ "step": 9150
2756
+ },
2757
+ {
2758
+ "epoch": 29.677419354838708,
2759
+ "grad_norm": 0.357373982667923,
2760
+ "learning_rate": 8.130322580645163e-05,
2761
+ "loss": 0.1317,
2762
+ "step": 9200
2763
+ },
2764
+ {
2765
+ "epoch": 29.677419354838708,
2766
+ "eval_loss": 0.1297539323568344,
2767
+ "eval_runtime": 87.917,
2768
+ "eval_samples_per_second": 216.158,
2769
+ "eval_steps_per_second": 4.504,
2770
+ "step": 9200
2771
+ },
2772
+ {
2773
+ "epoch": 29.838709677419356,
2774
+ "grad_norm": 0.3070197105407715,
2775
+ "learning_rate": 8.065806451612904e-05,
2776
+ "loss": 0.1277,
2777
+ "step": 9250
2778
+ },
2779
+ {
2780
+ "epoch": 29.838709677419356,
2781
+ "eval_loss": 0.12903046607971191,
2782
+ "eval_runtime": 90.6017,
2783
+ "eval_samples_per_second": 209.753,
2784
+ "eval_steps_per_second": 4.371,
2785
+ "step": 9250
2786
+ },
2787
+ {
2788
+ "epoch": 30.0,
2789
+ "grad_norm": 0.36131608486175537,
2790
+ "learning_rate": 8.001290322580646e-05,
2791
+ "loss": 0.1244,
2792
+ "step": 9300
2793
+ },
2794
+ {
2795
+ "epoch": 30.0,
2796
+ "eval_loss": 0.1285567581653595,
2797
+ "eval_runtime": 88.2187,
2798
+ "eval_samples_per_second": 215.419,
2799
+ "eval_steps_per_second": 4.489,
2800
+ "step": 9300
2801
+ },
2802
+ {
2803
+ "epoch": 30.161290322580644,
2804
+ "grad_norm": 0.31005364656448364,
2805
+ "learning_rate": 7.936774193548388e-05,
2806
+ "loss": 0.1274,
2807
+ "step": 9350
2808
+ },
2809
+ {
2810
+ "epoch": 30.161290322580644,
2811
+ "eval_loss": 0.12861816585063934,
2812
+ "eval_runtime": 88.8974,
2813
+ "eval_samples_per_second": 213.775,
2814
+ "eval_steps_per_second": 4.455,
2815
+ "step": 9350
2816
+ },
2817
+ {
2818
+ "epoch": 30.322580645161292,
2819
+ "grad_norm": 0.3450087904930115,
2820
+ "learning_rate": 7.872258064516129e-05,
2821
+ "loss": 0.1315,
2822
+ "step": 9400
2823
+ },
2824
+ {
2825
+ "epoch": 30.322580645161292,
2826
+ "eval_loss": 0.12834839522838593,
2827
+ "eval_runtime": 88.9142,
2828
+ "eval_samples_per_second": 213.734,
2829
+ "eval_steps_per_second": 4.454,
2830
+ "step": 9400
2831
+ },
2832
+ {
2833
+ "epoch": 30.483870967741936,
2834
+ "grad_norm": 0.26987212896347046,
2835
+ "learning_rate": 7.807741935483871e-05,
2836
+ "loss": 0.1228,
2837
+ "step": 9450
2838
+ },
2839
+ {
2840
+ "epoch": 30.483870967741936,
2841
+ "eval_loss": 0.12817350029945374,
2842
+ "eval_runtime": 88.7737,
2843
+ "eval_samples_per_second": 214.073,
2844
+ "eval_steps_per_second": 4.461,
2845
+ "step": 9450
2846
+ },
2847
+ {
2848
+ "epoch": 30.64516129032258,
2849
+ "grad_norm": 0.3717745244503021,
2850
+ "learning_rate": 7.743225806451613e-05,
2851
+ "loss": 0.124,
2852
+ "step": 9500
2853
+ },
2854
+ {
2855
+ "epoch": 30.64516129032258,
2856
+ "eval_loss": 0.13065199553966522,
2857
+ "eval_runtime": 88.4187,
2858
+ "eval_samples_per_second": 214.932,
2859
+ "eval_steps_per_second": 4.479,
2860
+ "step": 9500
2861
+ },
2862
+ {
2863
+ "epoch": 30.806451612903224,
2864
+ "grad_norm": 0.29905572533607483,
2865
+ "learning_rate": 7.678709677419355e-05,
2866
+ "loss": 0.1212,
2867
+ "step": 9550
2868
+ },
2869
+ {
2870
+ "epoch": 30.806451612903224,
2871
+ "eval_loss": 0.1283879578113556,
2872
+ "eval_runtime": 87.9276,
2873
+ "eval_samples_per_second": 216.132,
2874
+ "eval_steps_per_second": 4.504,
2875
+ "step": 9550
2876
+ },
2877
+ {
2878
+ "epoch": 30.967741935483872,
2879
+ "grad_norm": 0.2816069722175598,
2880
+ "learning_rate": 7.614193548387098e-05,
2881
+ "loss": 0.1233,
2882
+ "step": 9600
2883
+ },
2884
+ {
2885
+ "epoch": 30.967741935483872,
2886
+ "eval_loss": 0.12923495471477509,
2887
+ "eval_runtime": 89.2797,
2888
+ "eval_samples_per_second": 212.859,
2889
+ "eval_steps_per_second": 4.436,
2890
+ "step": 9600
2891
+ },
2892
+ {
2893
+ "epoch": 31.129032258064516,
2894
+ "grad_norm": 0.30339810252189636,
2895
+ "learning_rate": 7.549677419354839e-05,
2896
+ "loss": 0.1226,
2897
+ "step": 9650
2898
+ },
2899
+ {
2900
+ "epoch": 31.129032258064516,
2901
+ "eval_loss": 0.12811945378780365,
2902
+ "eval_runtime": 85.4363,
2903
+ "eval_samples_per_second": 222.435,
2904
+ "eval_steps_per_second": 4.635,
2905
+ "step": 9650
2906
+ },
2907
+ {
2908
+ "epoch": 31.29032258064516,
2909
+ "grad_norm": 0.3012908697128296,
2910
+ "learning_rate": 7.48516129032258e-05,
2911
+ "loss": 0.1244,
2912
+ "step": 9700
2913
+ },
2914
+ {
2915
+ "epoch": 31.29032258064516,
2916
+ "eval_loss": 0.12850458920001984,
2917
+ "eval_runtime": 90.0213,
2918
+ "eval_samples_per_second": 211.106,
2919
+ "eval_steps_per_second": 4.399,
2920
+ "step": 9700
2921
+ },
2922
+ {
2923
+ "epoch": 31.451612903225808,
2924
+ "grad_norm": 0.36201730370521545,
2925
+ "learning_rate": 7.420645161290323e-05,
2926
+ "loss": 0.1272,
2927
+ "step": 9750
2928
+ },
2929
+ {
2930
+ "epoch": 31.451612903225808,
2931
+ "eval_loss": 0.12800218164920807,
2932
+ "eval_runtime": 95.4408,
2933
+ "eval_samples_per_second": 199.118,
2934
+ "eval_steps_per_second": 4.149,
2935
+ "step": 9750
2936
+ },
2937
+ {
2938
+ "epoch": 31.612903225806452,
2939
+ "grad_norm": 0.30312639474868774,
2940
+ "learning_rate": 7.356129032258064e-05,
2941
+ "loss": 0.1248,
2942
+ "step": 9800
2943
+ },
2944
+ {
2945
+ "epoch": 31.612903225806452,
2946
+ "eval_loss": 0.12891393899917603,
2947
+ "eval_runtime": 89.5417,
2948
+ "eval_samples_per_second": 212.236,
2949
+ "eval_steps_per_second": 4.423,
2950
+ "step": 9800
2951
+ },
2952
+ {
2953
+ "epoch": 31.774193548387096,
2954
+ "grad_norm": 0.34564414620399475,
2955
+ "learning_rate": 7.291612903225807e-05,
2956
+ "loss": 0.1245,
2957
+ "step": 9850
2958
+ },
2959
+ {
2960
+ "epoch": 31.774193548387096,
2961
+ "eval_loss": 0.12625598907470703,
2962
+ "eval_runtime": 92.1876,
2963
+ "eval_samples_per_second": 206.145,
2964
+ "eval_steps_per_second": 4.296,
2965
+ "step": 9850
2966
+ },
2967
+ {
2968
+ "epoch": 31.93548387096774,
2969
+ "grad_norm": 0.31116342544555664,
2970
+ "learning_rate": 7.22709677419355e-05,
2971
+ "loss": 0.1261,
2972
+ "step": 9900
2973
+ },
2974
+ {
2975
+ "epoch": 31.93548387096774,
2976
+ "eval_loss": 0.1266312599182129,
2977
+ "eval_runtime": 105.6265,
2978
+ "eval_samples_per_second": 179.917,
2979
+ "eval_steps_per_second": 3.749,
2980
+ "step": 9900
2981
+ },
2982
+ {
2983
+ "epoch": 32.096774193548384,
2984
+ "grad_norm": 0.2931393086910248,
2985
+ "learning_rate": 7.16258064516129e-05,
2986
+ "loss": 0.1228,
2987
+ "step": 9950
2988
+ },
2989
+ {
2990
+ "epoch": 32.096774193548384,
2991
+ "eval_loss": 0.12575842440128326,
2992
+ "eval_runtime": 89.3508,
2993
+ "eval_samples_per_second": 212.69,
2994
+ "eval_steps_per_second": 4.432,
2995
+ "step": 9950
2996
+ },
2997
+ {
2998
+ "epoch": 32.25806451612903,
2999
+ "grad_norm": 0.244206041097641,
3000
+ "learning_rate": 7.098064516129033e-05,
3001
+ "loss": 0.1232,
3002
+ "step": 10000
3003
+ },
3004
+ {
3005
+ "epoch": 32.25806451612903,
3006
+ "eval_loss": 0.12638460099697113,
3007
+ "eval_runtime": 101.1824,
3008
+ "eval_samples_per_second": 187.819,
3009
+ "eval_steps_per_second": 3.914,
3010
+ "step": 10000
3011
+ },
3012
+ {
3013
+ "epoch": 32.41935483870968,
3014
+ "grad_norm": 0.2691047489643097,
3015
+ "learning_rate": 7.033548387096774e-05,
3016
+ "loss": 0.1224,
3017
+ "step": 10050
3018
+ },
3019
+ {
3020
+ "epoch": 32.41935483870968,
3021
+ "eval_loss": 0.12849974632263184,
3022
+ "eval_runtime": 91.3103,
3023
+ "eval_samples_per_second": 208.126,
3024
+ "eval_steps_per_second": 4.337,
3025
+ "step": 10050
3026
+ },
3027
+ {
3028
+ "epoch": 32.58064516129032,
3029
+ "grad_norm": 0.30258217453956604,
3030
+ "learning_rate": 6.969032258064516e-05,
3031
+ "loss": 0.1239,
3032
+ "step": 10100
3033
+ },
3034
+ {
3035
+ "epoch": 32.58064516129032,
3036
+ "eval_loss": 0.12675543129444122,
3037
+ "eval_runtime": 87.3987,
3038
+ "eval_samples_per_second": 217.44,
3039
+ "eval_steps_per_second": 4.531,
3040
+ "step": 10100
3041
+ },
3042
+ {
3043
+ "epoch": 32.74193548387097,
3044
+ "grad_norm": 0.2675139009952545,
3045
+ "learning_rate": 6.904516129032258e-05,
3046
+ "loss": 0.1214,
3047
+ "step": 10150
3048
+ },
3049
+ {
3050
+ "epoch": 32.74193548387097,
3051
+ "eval_loss": 0.12617285549640656,
3052
+ "eval_runtime": 87.9466,
3053
+ "eval_samples_per_second": 216.086,
3054
+ "eval_steps_per_second": 4.503,
3055
+ "step": 10150
3056
+ },
3057
+ {
3058
+ "epoch": 32.903225806451616,
3059
+ "grad_norm": 0.3610474467277527,
3060
+ "learning_rate": 6.840000000000001e-05,
3061
+ "loss": 0.1205,
3062
+ "step": 10200
3063
+ },
3064
+ {
3065
+ "epoch": 32.903225806451616,
3066
+ "eval_loss": 0.12886326014995575,
3067
+ "eval_runtime": 88.5505,
3068
+ "eval_samples_per_second": 214.612,
3069
+ "eval_steps_per_second": 4.472,
3070
+ "step": 10200
3071
+ },
3072
+ {
3073
+ "epoch": 33.064516129032256,
3074
+ "grad_norm": 0.3488837480545044,
3075
+ "learning_rate": 6.775483870967742e-05,
3076
+ "loss": 0.1235,
3077
+ "step": 10250
3078
+ },
3079
+ {
3080
+ "epoch": 33.064516129032256,
3081
+ "eval_loss": 0.12522108852863312,
3082
+ "eval_runtime": 86.8524,
3083
+ "eval_samples_per_second": 218.808,
3084
+ "eval_steps_per_second": 4.559,
3085
+ "step": 10250
3086
+ },
3087
+ {
3088
+ "epoch": 33.225806451612904,
3089
+ "grad_norm": 0.31276750564575195,
3090
+ "learning_rate": 6.710967741935485e-05,
3091
+ "loss": 0.1205,
3092
+ "step": 10300
3093
+ },
3094
+ {
3095
+ "epoch": 33.225806451612904,
3096
+ "eval_loss": 0.12656815350055695,
3097
+ "eval_runtime": 88.7072,
3098
+ "eval_samples_per_second": 214.233,
3099
+ "eval_steps_per_second": 4.464,
3100
+ "step": 10300
3101
+ },
3102
+ {
3103
+ "epoch": 33.38709677419355,
3104
+ "grad_norm": 0.2674780488014221,
3105
+ "learning_rate": 6.646451612903226e-05,
3106
+ "loss": 0.1189,
3107
+ "step": 10350
3108
+ },
3109
+ {
3110
+ "epoch": 33.38709677419355,
3111
+ "eval_loss": 0.12716618180274963,
3112
+ "eval_runtime": 86.5042,
3113
+ "eval_samples_per_second": 219.689,
3114
+ "eval_steps_per_second": 4.578,
3115
+ "step": 10350
3116
+ },
3117
+ {
3118
+ "epoch": 33.54838709677419,
3119
+ "grad_norm": 0.35878920555114746,
3120
+ "learning_rate": 6.581935483870969e-05,
3121
+ "loss": 0.1187,
3122
+ "step": 10400
3123
+ },
3124
+ {
3125
+ "epoch": 33.54838709677419,
3126
+ "eval_loss": 0.12553632259368896,
3127
+ "eval_runtime": 89.3045,
3128
+ "eval_samples_per_second": 212.8,
3129
+ "eval_steps_per_second": 4.434,
3130
+ "step": 10400
3131
+ },
3132
+ {
3133
+ "epoch": 33.70967741935484,
3134
+ "grad_norm": 0.3341622054576874,
3135
+ "learning_rate": 6.51741935483871e-05,
3136
+ "loss": 0.1256,
3137
+ "step": 10450
3138
+ },
3139
+ {
3140
+ "epoch": 33.70967741935484,
3141
+ "eval_loss": 0.12624432146549225,
3142
+ "eval_runtime": 88.9284,
3143
+ "eval_samples_per_second": 213.7,
3144
+ "eval_steps_per_second": 4.453,
3145
+ "step": 10450
3146
+ },
3147
+ {
3148
+ "epoch": 33.87096774193548,
3149
+ "grad_norm": 0.30921441316604614,
3150
+ "learning_rate": 6.452903225806451e-05,
3151
+ "loss": 0.1226,
3152
+ "step": 10500
3153
+ },
3154
+ {
3155
+ "epoch": 33.87096774193548,
3156
+ "eval_loss": 0.12660150229930878,
3157
+ "eval_runtime": 89.8795,
3158
+ "eval_samples_per_second": 211.439,
3159
+ "eval_steps_per_second": 4.406,
3160
+ "step": 10500
3161
+ },
3162
+ {
3163
+ "epoch": 34.03225806451613,
3164
+ "grad_norm": 0.39118140935897827,
3165
+ "learning_rate": 6.388387096774194e-05,
3166
+ "loss": 0.1239,
3167
+ "step": 10550
3168
+ },
3169
+ {
3170
+ "epoch": 34.03225806451613,
3171
+ "eval_loss": 0.1243576630949974,
3172
+ "eval_runtime": 89.012,
3173
+ "eval_samples_per_second": 213.499,
3174
+ "eval_steps_per_second": 4.449,
3175
+ "step": 10550
3176
+ },
3177
+ {
3178
+ "epoch": 34.193548387096776,
3179
+ "grad_norm": 0.3129843771457672,
3180
+ "learning_rate": 6.323870967741936e-05,
3181
+ "loss": 0.1227,
3182
+ "step": 10600
3183
+ },
3184
+ {
3185
+ "epoch": 34.193548387096776,
3186
+ "eval_loss": 0.12571550905704498,
3187
+ "eval_runtime": 88.7287,
3188
+ "eval_samples_per_second": 214.181,
3189
+ "eval_steps_per_second": 4.463,
3190
+ "step": 10600
3191
+ },
3192
+ {
3193
+ "epoch": 34.354838709677416,
3194
+ "grad_norm": 0.3165799081325531,
3195
+ "learning_rate": 6.259354838709678e-05,
3196
+ "loss": 0.1176,
3197
+ "step": 10650
3198
+ },
3199
+ {
3200
+ "epoch": 34.354838709677416,
3201
+ "eval_loss": 0.12522710859775543,
3202
+ "eval_runtime": 87.961,
3203
+ "eval_samples_per_second": 216.05,
3204
+ "eval_steps_per_second": 4.502,
3205
+ "step": 10650
3206
+ },
3207
+ {
3208
+ "epoch": 34.516129032258064,
3209
+ "grad_norm": 0.30507832765579224,
3210
+ "learning_rate": 6.19483870967742e-05,
3211
+ "loss": 0.1224,
3212
+ "step": 10700
3213
+ },
3214
+ {
3215
+ "epoch": 34.516129032258064,
3216
+ "eval_loss": 0.1257481724023819,
3217
+ "eval_runtime": 88.8686,
3218
+ "eval_samples_per_second": 213.844,
3219
+ "eval_steps_per_second": 4.456,
3220
+ "step": 10700
3221
+ },
3222
+ {
3223
+ "epoch": 34.67741935483871,
3224
+ "grad_norm": 0.35299909114837646,
3225
+ "learning_rate": 6.130322580645161e-05,
3226
+ "loss": 0.1194,
3227
+ "step": 10750
3228
+ },
3229
+ {
3230
+ "epoch": 34.67741935483871,
3231
+ "eval_loss": 0.12496702373027802,
3232
+ "eval_runtime": 90.6297,
3233
+ "eval_samples_per_second": 209.689,
3234
+ "eval_steps_per_second": 4.369,
3235
+ "step": 10750
3236
+ },
3237
+ {
3238
+ "epoch": 34.83870967741935,
3239
+ "grad_norm": 0.29007554054260254,
3240
+ "learning_rate": 6.065806451612903e-05,
3241
+ "loss": 0.1187,
3242
+ "step": 10800
3243
+ },
3244
+ {
3245
+ "epoch": 34.83870967741935,
3246
+ "eval_loss": 0.12488405406475067,
3247
+ "eval_runtime": 88.9174,
3248
+ "eval_samples_per_second": 213.726,
3249
+ "eval_steps_per_second": 4.454,
3250
+ "step": 10800
3251
+ },
3252
+ {
3253
+ "epoch": 35.0,
3254
+ "grad_norm": 0.48845478892326355,
3255
+ "learning_rate": 6.001290322580645e-05,
3256
+ "loss": 0.1196,
3257
+ "step": 10850
3258
+ },
3259
+ {
3260
+ "epoch": 35.0,
3261
+ "eval_loss": 0.12530028820037842,
3262
+ "eval_runtime": 89.441,
3263
+ "eval_samples_per_second": 212.475,
3264
+ "eval_steps_per_second": 4.428,
3265
+ "step": 10850
3266
+ },
3267
+ {
3268
+ "epoch": 35.16129032258065,
3269
+ "grad_norm": 0.25860869884490967,
3270
+ "learning_rate": 5.936774193548388e-05,
3271
+ "loss": 0.1183,
3272
+ "step": 10900
3273
+ },
3274
+ {
3275
+ "epoch": 35.16129032258065,
3276
+ "eval_loss": 0.12282554060220718,
3277
+ "eval_runtime": 88.2534,
3278
+ "eval_samples_per_second": 215.334,
3279
+ "eval_steps_per_second": 4.487,
3280
+ "step": 10900
3281
+ },
3282
+ {
3283
+ "epoch": 35.32258064516129,
3284
+ "grad_norm": 0.330858051776886,
3285
+ "learning_rate": 5.87225806451613e-05,
3286
+ "loss": 0.1157,
3287
+ "step": 10950
3288
+ },
3289
+ {
3290
+ "epoch": 35.32258064516129,
3291
+ "eval_loss": 0.12600766122341156,
3292
+ "eval_runtime": 88.4405,
3293
+ "eval_samples_per_second": 214.879,
3294
+ "eval_steps_per_second": 4.478,
3295
+ "step": 10950
3296
+ },
3297
+ {
3298
+ "epoch": 35.483870967741936,
3299
+ "grad_norm": 0.2873861789703369,
3300
+ "learning_rate": 5.8077419354838716e-05,
3301
+ "loss": 0.1188,
3302
+ "step": 11000
3303
+ },
3304
+ {
3305
+ "epoch": 35.483870967741936,
3306
+ "eval_loss": 0.12375803291797638,
3307
+ "eval_runtime": 90.5675,
3308
+ "eval_samples_per_second": 209.832,
3309
+ "eval_steps_per_second": 4.372,
3310
+ "step": 11000
3311
+ },
3312
+ {
3313
+ "epoch": 35.645161290322584,
3314
+ "grad_norm": 0.3179507851600647,
3315
+ "learning_rate": 5.743225806451613e-05,
3316
+ "loss": 0.1166,
3317
+ "step": 11050
3318
+ },
3319
+ {
3320
+ "epoch": 35.645161290322584,
3321
+ "eval_loss": 0.12402182072401047,
3322
+ "eval_runtime": 89.7815,
3323
+ "eval_samples_per_second": 211.669,
3324
+ "eval_steps_per_second": 4.411,
3325
+ "step": 11050
3326
+ },
3327
+ {
3328
+ "epoch": 35.806451612903224,
3329
+ "grad_norm": 0.2623940110206604,
3330
+ "learning_rate": 5.678709677419355e-05,
3331
+ "loss": 0.116,
3332
+ "step": 11100
3333
+ },
3334
+ {
3335
+ "epoch": 35.806451612903224,
3336
+ "eval_loss": 0.12476334720849991,
3337
+ "eval_runtime": 89.8543,
3338
+ "eval_samples_per_second": 211.498,
3339
+ "eval_steps_per_second": 4.407,
3340
+ "step": 11100
3341
+ },
3342
+ {
3343
+ "epoch": 35.96774193548387,
3344
+ "grad_norm": 0.2534388601779938,
3345
+ "learning_rate": 5.614193548387097e-05,
3346
+ "loss": 0.1207,
3347
+ "step": 11150
3348
+ },
3349
+ {
3350
+ "epoch": 35.96774193548387,
3351
+ "eval_loss": 0.12570072710514069,
3352
+ "eval_runtime": 91.2008,
3353
+ "eval_samples_per_second": 208.375,
3354
+ "eval_steps_per_second": 4.342,
3355
+ "step": 11150
3356
+ },
3357
+ {
3358
+ "epoch": 36.12903225806452,
3359
+ "grad_norm": 0.30418768525123596,
3360
+ "learning_rate": 5.5496774193548386e-05,
3361
+ "loss": 0.1177,
3362
+ "step": 11200
3363
+ },
3364
+ {
3365
+ "epoch": 36.12903225806452,
3366
+ "eval_loss": 0.12448415905237198,
3367
+ "eval_runtime": 89.3046,
3368
+ "eval_samples_per_second": 212.8,
3369
+ "eval_steps_per_second": 4.434,
3370
+ "step": 11200
3371
+ },
3372
+ {
3373
+ "epoch": 36.29032258064516,
3374
+ "grad_norm": 0.290436327457428,
3375
+ "learning_rate": 5.485161290322581e-05,
3376
+ "loss": 0.1147,
3377
+ "step": 11250
3378
+ },
3379
+ {
3380
+ "epoch": 36.29032258064516,
3381
+ "eval_loss": 0.12383412569761276,
3382
+ "eval_runtime": 91.1386,
3383
+ "eval_samples_per_second": 208.517,
3384
+ "eval_steps_per_second": 4.345,
3385
+ "step": 11250
3386
+ },
3387
+ {
3388
+ "epoch": 36.45161290322581,
3389
+ "grad_norm": 0.3162536919116974,
3390
+ "learning_rate": 5.420645161290323e-05,
3391
+ "loss": 0.1154,
3392
+ "step": 11300
3393
+ },
3394
+ {
3395
+ "epoch": 36.45161290322581,
3396
+ "eval_loss": 0.12289289385080338,
3397
+ "eval_runtime": 90.6693,
3398
+ "eval_samples_per_second": 209.597,
3399
+ "eval_steps_per_second": 4.368,
3400
+ "step": 11300
3401
+ },
3402
+ {
3403
+ "epoch": 36.61290322580645,
3404
+ "grad_norm": 0.3101736903190613,
3405
+ "learning_rate": 5.356129032258065e-05,
3406
+ "loss": 0.1146,
3407
+ "step": 11350
3408
+ },
3409
+ {
3410
+ "epoch": 36.61290322580645,
3411
+ "eval_loss": 0.12234646826982498,
3412
+ "eval_runtime": 89.6047,
3413
+ "eval_samples_per_second": 212.087,
3414
+ "eval_steps_per_second": 4.419,
3415
+ "step": 11350
3416
+ },
3417
+ {
3418
+ "epoch": 36.774193548387096,
3419
+ "grad_norm": 0.32668620347976685,
3420
+ "learning_rate": 5.291612903225806e-05,
3421
+ "loss": 0.1196,
3422
+ "step": 11400
3423
+ },
3424
+ {
3425
+ "epoch": 36.774193548387096,
3426
+ "eval_loss": 0.12317313253879547,
3427
+ "eval_runtime": 88.2522,
3428
+ "eval_samples_per_second": 215.337,
3429
+ "eval_steps_per_second": 4.487,
3430
+ "step": 11400
3431
+ },
3432
+ {
3433
+ "epoch": 36.935483870967744,
3434
+ "grad_norm": 0.3477221429347992,
3435
+ "learning_rate": 5.227096774193548e-05,
3436
+ "loss": 0.1165,
3437
+ "step": 11450
3438
+ },
3439
+ {
3440
+ "epoch": 36.935483870967744,
3441
+ "eval_loss": 0.12321442365646362,
3442
+ "eval_runtime": 90.4924,
3443
+ "eval_samples_per_second": 210.007,
3444
+ "eval_steps_per_second": 4.376,
3445
+ "step": 11450
3446
+ },
3447
+ {
3448
+ "epoch": 37.096774193548384,
3449
+ "grad_norm": 0.3417079448699951,
3450
+ "learning_rate": 5.16258064516129e-05,
3451
+ "loss": 0.1193,
3452
+ "step": 11500
3453
+ },
3454
+ {
3455
+ "epoch": 37.096774193548384,
3456
+ "eval_loss": 0.12433473765850067,
3457
+ "eval_runtime": 87.8702,
3458
+ "eval_samples_per_second": 216.273,
3459
+ "eval_steps_per_second": 4.507,
3460
+ "step": 11500
3461
+ },
3462
+ {
3463
+ "epoch": 37.25806451612903,
3464
+ "grad_norm": 0.4154585301876068,
3465
+ "learning_rate": 5.098064516129033e-05,
3466
+ "loss": 0.1178,
3467
+ "step": 11550
3468
+ },
3469
+ {
3470
+ "epoch": 37.25806451612903,
3471
+ "eval_loss": 0.12312240153551102,
3472
+ "eval_runtime": 87.9067,
3473
+ "eval_samples_per_second": 216.184,
3474
+ "eval_steps_per_second": 4.505,
3475
+ "step": 11550
3476
+ },
3477
+ {
3478
+ "epoch": 37.41935483870968,
3479
+ "grad_norm": 0.3054625988006592,
3480
+ "learning_rate": 5.0335483870967747e-05,
3481
+ "loss": 0.1175,
3482
+ "step": 11600
3483
+ },
3484
+ {
3485
+ "epoch": 37.41935483870968,
3486
+ "eval_loss": 0.12180905044078827,
3487
+ "eval_runtime": 88.3639,
3488
+ "eval_samples_per_second": 215.065,
3489
+ "eval_steps_per_second": 4.481,
3490
+ "step": 11600
3491
+ },
3492
+ {
3493
+ "epoch": 37.58064516129032,
3494
+ "grad_norm": 0.28812453150749207,
3495
+ "learning_rate": 4.9690322580645166e-05,
3496
+ "loss": 0.117,
3497
+ "step": 11650
3498
+ },
3499
+ {
3500
+ "epoch": 37.58064516129032,
3501
+ "eval_loss": 0.12359043955802917,
3502
+ "eval_runtime": 88.3972,
3503
+ "eval_samples_per_second": 214.984,
3504
+ "eval_steps_per_second": 4.48,
3505
+ "step": 11650
3506
+ },
3507
+ {
3508
+ "epoch": 37.74193548387097,
3509
+ "grad_norm": 0.23818424344062805,
3510
+ "learning_rate": 4.9045161290322585e-05,
3511
+ "loss": 0.115,
3512
+ "step": 11700
3513
+ },
3514
+ {
3515
+ "epoch": 37.74193548387097,
3516
+ "eval_loss": 0.12239911407232285,
3517
+ "eval_runtime": 89.3047,
3518
+ "eval_samples_per_second": 212.8,
3519
+ "eval_steps_per_second": 4.434,
3520
+ "step": 11700
3521
+ },
3522
+ {
3523
+ "epoch": 37.903225806451616,
3524
+ "grad_norm": 0.28868499398231506,
3525
+ "learning_rate": 4.8400000000000004e-05,
3526
+ "loss": 0.1176,
3527
+ "step": 11750
3528
+ },
3529
+ {
3530
+ "epoch": 37.903225806451616,
3531
+ "eval_loss": 0.12191120535135269,
3532
+ "eval_runtime": 88.9578,
3533
+ "eval_samples_per_second": 213.629,
3534
+ "eval_steps_per_second": 4.452,
3535
+ "step": 11750
3536
+ },
3537
+ {
3538
+ "epoch": 38.064516129032256,
3539
+ "grad_norm": 0.33930733799934387,
3540
+ "learning_rate": 4.775483870967742e-05,
3541
+ "loss": 0.1145,
3542
+ "step": 11800
3543
+ },
3544
+ {
3545
+ "epoch": 38.064516129032256,
3546
+ "eval_loss": 0.12135030329227448,
3547
+ "eval_runtime": 90.281,
3548
+ "eval_samples_per_second": 210.498,
3549
+ "eval_steps_per_second": 4.386,
3550
+ "step": 11800
3551
+ },
3552
+ {
3553
+ "epoch": 38.225806451612904,
3554
+ "grad_norm": 0.3511495590209961,
3555
+ "learning_rate": 4.710967741935484e-05,
3556
+ "loss": 0.1161,
3557
+ "step": 11850
3558
+ },
3559
+ {
3560
+ "epoch": 38.225806451612904,
3561
+ "eval_loss": 0.12242971360683441,
3562
+ "eval_runtime": 89.1539,
3563
+ "eval_samples_per_second": 213.159,
3564
+ "eval_steps_per_second": 4.442,
3565
+ "step": 11850
3566
+ },
3567
+ {
3568
+ "epoch": 38.38709677419355,
3569
+ "grad_norm": 0.37273716926574707,
3570
+ "learning_rate": 4.646451612903226e-05,
3571
+ "loss": 0.1107,
3572
+ "step": 11900
3573
+ },
3574
+ {
3575
+ "epoch": 38.38709677419355,
3576
+ "eval_loss": 0.12210850417613983,
3577
+ "eval_runtime": 89.9424,
3578
+ "eval_samples_per_second": 211.291,
3579
+ "eval_steps_per_second": 4.403,
3580
+ "step": 11900
3581
+ },
3582
+ {
3583
+ "epoch": 38.54838709677419,
3584
+ "grad_norm": 0.2974016070365906,
3585
+ "learning_rate": 4.5819354838709674e-05,
3586
+ "loss": 0.1135,
3587
+ "step": 11950
3588
+ },
3589
+ {
3590
+ "epoch": 38.54838709677419,
3591
+ "eval_loss": 0.12212313711643219,
3592
+ "eval_runtime": 89.4298,
3593
+ "eval_samples_per_second": 212.502,
3594
+ "eval_steps_per_second": 4.428,
3595
+ "step": 11950
3596
+ },
3597
+ {
3598
+ "epoch": 38.70967741935484,
3599
+ "grad_norm": 0.32708504796028137,
3600
+ "learning_rate": 4.51741935483871e-05,
3601
+ "loss": 0.1167,
3602
+ "step": 12000
3603
+ },
3604
+ {
3605
+ "epoch": 38.70967741935484,
3606
+ "eval_loss": 0.12119368463754654,
3607
+ "eval_runtime": 88.7472,
3608
+ "eval_samples_per_second": 214.136,
3609
+ "eval_steps_per_second": 4.462,
3610
+ "step": 12000
3611
+ }
3612
+ ],
3613
+ "logging_steps": 50,
3614
+ "max_steps": 15500,
3615
+ "num_input_tokens_seen": 0,
3616
+ "num_train_epochs": 50,
3617
+ "save_steps": 1000,
3618
+ "stateful_callbacks": {
3619
+ "TrainerControl": {
3620
+ "args": {
3621
+ "should_epoch_stop": false,
3622
+ "should_evaluate": false,
3623
+ "should_log": false,
3624
+ "should_save": true,
3625
+ "should_training_stop": false
3626
+ },
3627
+ "attributes": {}
3628
+ }
3629
+ },
3630
+ "total_flos": 1.2083464406274048e+17,
3631
+ "train_batch_size": 96,
3632
+ "trial_name": null,
3633
+ "trial_params": null
3634
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d1f34eb3ddd6fe493ce04f41b2277a2e6325e8cc6221eb679f1d3cbc87664c1
3
+ size 15761