katelyndekeer commited on
Commit
01b790d
·
verified ·
1 Parent(s): cf503eb

Upload poetry scorer model

Browse files
checkpoint-4383/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "LABEL_0",
13
+ "1": "LABEL_1",
14
+ "2": "LABEL_2",
15
+ "3": "LABEL_3",
16
+ "4": "LABEL_4"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "label2id": {
21
+ "LABEL_0": 0,
22
+ "LABEL_1": 1,
23
+ "LABEL_2": 2,
24
+ "LABEL_3": 3,
25
+ "LABEL_4": 4
26
+ },
27
+ "layer_norm_eps": 1e-12,
28
+ "max_position_embeddings": 512,
29
+ "model_type": "bert",
30
+ "num_attention_heads": 12,
31
+ "num_hidden_layers": 12,
32
+ "pad_token_id": 0,
33
+ "position_embedding_type": "absolute",
34
+ "problem_type": "single_label_classification",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.51.0",
37
+ "type_vocab_size": 2,
38
+ "use_cache": true,
39
+ "vocab_size": 30522
40
+ }
checkpoint-4383/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aaa4eb908d85996f72b3a3ac68eb9233eec05e7080b358420644dc0bef050fba
3
+ size 437967876
checkpoint-4383/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0880591b213c58de30b90b03001c130eee0681d617f505bc33e63d28f6e526c0
3
+ size 876051194
checkpoint-4383/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f21db73fe0bee1def719bc7ca8ffb87ef9b37418f66c624a172847abfae14e95
3
+ size 13990
checkpoint-4383/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a84471e549a1bcd1ad512fffd13af8ba993316db76245c07a72bfb03ca82ec4d
3
+ size 1064
checkpoint-4383/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-4383/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 512,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
checkpoint-4383/trainer_state.json ADDED
@@ -0,0 +1,3100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 4383,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.006844626967830253,
14
+ "grad_norm": 5.76295804977417,
15
+ "learning_rate": 4.989733059548255e-05,
16
+ "loss": 1.6823,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.013689253935660506,
21
+ "grad_norm": 4.7698774337768555,
22
+ "learning_rate": 4.9783253479352045e-05,
23
+ "loss": 1.6341,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.02053388090349076,
28
+ "grad_norm": 5.027968406677246,
29
+ "learning_rate": 4.966917636322154e-05,
30
+ "loss": 1.6223,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.02737850787132101,
35
+ "grad_norm": 4.3074259757995605,
36
+ "learning_rate": 4.955509924709104e-05,
37
+ "loss": 1.604,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.034223134839151265,
42
+ "grad_norm": 4.665801525115967,
43
+ "learning_rate": 4.9441022130960526e-05,
44
+ "loss": 1.6194,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.04106776180698152,
49
+ "grad_norm": 5.566285133361816,
50
+ "learning_rate": 4.932694501483003e-05,
51
+ "loss": 1.6304,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.04791238877481177,
56
+ "grad_norm": 3.916844606399536,
57
+ "learning_rate": 4.921286789869952e-05,
58
+ "loss": 1.6259,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.05475701574264202,
63
+ "grad_norm": 5.772192478179932,
64
+ "learning_rate": 4.909879078256902e-05,
65
+ "loss": 1.5959,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.061601642710472276,
70
+ "grad_norm": 3.1512539386749268,
71
+ "learning_rate": 4.898471366643852e-05,
72
+ "loss": 1.6301,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.06844626967830253,
77
+ "grad_norm": 4.686239242553711,
78
+ "learning_rate": 4.887063655030801e-05,
79
+ "loss": 1.6057,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.07529089664613278,
84
+ "grad_norm": 5.45793342590332,
85
+ "learning_rate": 4.875655943417751e-05,
86
+ "loss": 1.5979,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.08213552361396304,
91
+ "grad_norm": 4.9296064376831055,
92
+ "learning_rate": 4.8642482318047e-05,
93
+ "loss": 1.6361,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.08898015058179329,
98
+ "grad_norm": 4.5990214347839355,
99
+ "learning_rate": 4.85284052019165e-05,
100
+ "loss": 1.6274,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.09582477754962354,
105
+ "grad_norm": 4.627445220947266,
106
+ "learning_rate": 4.841432808578599e-05,
107
+ "loss": 1.619,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.1026694045174538,
112
+ "grad_norm": 4.408857345581055,
113
+ "learning_rate": 4.8300250969655494e-05,
114
+ "loss": 1.6423,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.10951403148528405,
119
+ "grad_norm": 4.747621059417725,
120
+ "learning_rate": 4.818617385352498e-05,
121
+ "loss": 1.5996,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.1163586584531143,
126
+ "grad_norm": 7.228254318237305,
127
+ "learning_rate": 4.807209673739448e-05,
128
+ "loss": 1.6222,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.12320328542094455,
133
+ "grad_norm": 5.575058937072754,
134
+ "learning_rate": 4.7958019621263975e-05,
135
+ "loss": 1.6084,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.1300479123887748,
140
+ "grad_norm": 3.19671893119812,
141
+ "learning_rate": 4.784394250513347e-05,
142
+ "loss": 1.6149,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.13689253935660506,
147
+ "grad_norm": 5.1137189865112305,
148
+ "learning_rate": 4.772986538900297e-05,
149
+ "loss": 1.6063,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.1437371663244353,
154
+ "grad_norm": 4.85434103012085,
155
+ "learning_rate": 4.761578827287246e-05,
156
+ "loss": 1.6275,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.15058179329226556,
161
+ "grad_norm": 5.245377063751221,
162
+ "learning_rate": 4.7501711156741966e-05,
163
+ "loss": 1.6449,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.15742642026009582,
168
+ "grad_norm": 4.98168420791626,
169
+ "learning_rate": 4.7387634040611455e-05,
170
+ "loss": 1.6059,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.16427104722792607,
175
+ "grad_norm": 3.8005590438842773,
176
+ "learning_rate": 4.727355692448095e-05,
177
+ "loss": 1.6109,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.17111567419575632,
182
+ "grad_norm": 6.513299942016602,
183
+ "learning_rate": 4.715947980835045e-05,
184
+ "loss": 1.6528,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.17796030116358658,
189
+ "grad_norm": 10.949817657470703,
190
+ "learning_rate": 4.704540269221994e-05,
191
+ "loss": 1.5799,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.18480492813141683,
196
+ "grad_norm": 7.446742534637451,
197
+ "learning_rate": 4.693132557608944e-05,
198
+ "loss": 1.6679,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.19164955509924708,
203
+ "grad_norm": 8.343230247497559,
204
+ "learning_rate": 4.6817248459958935e-05,
205
+ "loss": 1.6584,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.19849418206707733,
210
+ "grad_norm": 5.609045028686523,
211
+ "learning_rate": 4.6703171343828425e-05,
212
+ "loss": 1.6863,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.2053388090349076,
217
+ "grad_norm": 4.078282356262207,
218
+ "learning_rate": 4.658909422769793e-05,
219
+ "loss": 1.6902,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.21218343600273784,
224
+ "grad_norm": 7.373465538024902,
225
+ "learning_rate": 4.6475017111567424e-05,
226
+ "loss": 1.6334,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.2190280629705681,
231
+ "grad_norm": 6.651344299316406,
232
+ "learning_rate": 4.636093999543692e-05,
233
+ "loss": 1.6344,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.22587268993839835,
238
+ "grad_norm": 5.423673152923584,
239
+ "learning_rate": 4.6246862879306416e-05,
240
+ "loss": 1.6429,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.2327173169062286,
245
+ "grad_norm": 8.67670726776123,
246
+ "learning_rate": 4.6132785763175905e-05,
247
+ "loss": 1.6337,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.23956194387405885,
252
+ "grad_norm": 8.004029273986816,
253
+ "learning_rate": 4.601870864704541e-05,
254
+ "loss": 1.6283,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.2464065708418891,
259
+ "grad_norm": 7.463047504425049,
260
+ "learning_rate": 4.59046315309149e-05,
261
+ "loss": 1.663,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.2532511978097194,
266
+ "grad_norm": 7.124159336090088,
267
+ "learning_rate": 4.57905544147844e-05,
268
+ "loss": 1.5997,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.2600958247775496,
273
+ "grad_norm": 5.7610602378845215,
274
+ "learning_rate": 4.567647729865389e-05,
275
+ "loss": 1.6273,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.2669404517453799,
280
+ "grad_norm": 9.745051383972168,
281
+ "learning_rate": 4.556240018252339e-05,
282
+ "loss": 1.6288,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.2737850787132101,
287
+ "grad_norm": 5.465712547302246,
288
+ "learning_rate": 4.544832306639288e-05,
289
+ "loss": 1.6221,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.2806297056810404,
294
+ "grad_norm": 5.518365383148193,
295
+ "learning_rate": 4.533424595026238e-05,
296
+ "loss": 1.6647,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.2874743326488706,
301
+ "grad_norm": 5.222605228424072,
302
+ "learning_rate": 4.5220168834131873e-05,
303
+ "loss": 1.6047,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.2943189596167009,
308
+ "grad_norm": 3.868781805038452,
309
+ "learning_rate": 4.510609171800137e-05,
310
+ "loss": 1.6516,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.30116358658453113,
315
+ "grad_norm": 8.720216751098633,
316
+ "learning_rate": 4.4992014601870866e-05,
317
+ "loss": 1.6237,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.3080082135523614,
322
+ "grad_norm": 5.4389872550964355,
323
+ "learning_rate": 4.487793748574036e-05,
324
+ "loss": 1.6307,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.31485284052019163,
329
+ "grad_norm": 7.961674690246582,
330
+ "learning_rate": 4.476386036960986e-05,
331
+ "loss": 1.6018,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.3216974674880219,
336
+ "grad_norm": 6.320678234100342,
337
+ "learning_rate": 4.4649783253479354e-05,
338
+ "loss": 1.6291,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.32854209445585214,
343
+ "grad_norm": 6.27506160736084,
344
+ "learning_rate": 4.453570613734885e-05,
345
+ "loss": 1.5799,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.3353867214236824,
350
+ "grad_norm": 5.602056503295898,
351
+ "learning_rate": 4.4421629021218346e-05,
352
+ "loss": 1.6632,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.34223134839151265,
357
+ "grad_norm": 5.55792760848999,
358
+ "learning_rate": 4.430755190508784e-05,
359
+ "loss": 1.631,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.3490759753593429,
364
+ "grad_norm": 7.593416213989258,
365
+ "learning_rate": 4.419347478895734e-05,
366
+ "loss": 1.6512,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.35592060232717315,
371
+ "grad_norm": 5.704894542694092,
372
+ "learning_rate": 4.4079397672826834e-05,
373
+ "loss": 1.6448,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.36276522929500343,
378
+ "grad_norm": 7.302018642425537,
379
+ "learning_rate": 4.396532055669633e-05,
380
+ "loss": 1.6276,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.36960985626283366,
385
+ "grad_norm": 6.3272857666015625,
386
+ "learning_rate": 4.3851243440565826e-05,
387
+ "loss": 1.601,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.37645448323066394,
392
+ "grad_norm": 3.699582815170288,
393
+ "learning_rate": 4.373716632443532e-05,
394
+ "loss": 1.6147,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.38329911019849416,
399
+ "grad_norm": 9.387665748596191,
400
+ "learning_rate": 4.362308920830482e-05,
401
+ "loss": 1.6616,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.39014373716632444,
406
+ "grad_norm": 6.073903560638428,
407
+ "learning_rate": 4.3509012092174314e-05,
408
+ "loss": 1.6294,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.39698836413415467,
413
+ "grad_norm": 6.1276044845581055,
414
+ "learning_rate": 4.3394934976043803e-05,
415
+ "loss": 1.6343,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.40383299110198495,
420
+ "grad_norm": 5.212673664093018,
421
+ "learning_rate": 4.3280857859913306e-05,
422
+ "loss": 1.5796,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.4106776180698152,
427
+ "grad_norm": 5.6972527503967285,
428
+ "learning_rate": 4.3166780743782796e-05,
429
+ "loss": 1.6342,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.41752224503764546,
434
+ "grad_norm": 5.296470642089844,
435
+ "learning_rate": 4.30527036276523e-05,
436
+ "loss": 1.5852,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.4243668720054757,
441
+ "grad_norm": 7.139956951141357,
442
+ "learning_rate": 4.293862651152179e-05,
443
+ "loss": 1.55,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.43121149897330596,
448
+ "grad_norm": 5.106082916259766,
449
+ "learning_rate": 4.282454939539129e-05,
450
+ "loss": 1.62,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.4380561259411362,
455
+ "grad_norm": 3.9289450645446777,
456
+ "learning_rate": 4.271047227926078e-05,
457
+ "loss": 1.6698,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.44490075290896647,
462
+ "grad_norm": 12.083281517028809,
463
+ "learning_rate": 4.2596395163130276e-05,
464
+ "loss": 1.6081,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.4517453798767967,
469
+ "grad_norm": 9.841049194335938,
470
+ "learning_rate": 4.248231804699977e-05,
471
+ "loss": 1.6394,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.458590006844627,
476
+ "grad_norm": 5.5694122314453125,
477
+ "learning_rate": 4.236824093086927e-05,
478
+ "loss": 1.6091,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.4654346338124572,
483
+ "grad_norm": 4.611788749694824,
484
+ "learning_rate": 4.225416381473877e-05,
485
+ "loss": 1.6093,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.4722792607802875,
490
+ "grad_norm": 6.967776775360107,
491
+ "learning_rate": 4.214008669860826e-05,
492
+ "loss": 1.5892,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.4791238877481177,
497
+ "grad_norm": 6.384361743927002,
498
+ "learning_rate": 4.2026009582477756e-05,
499
+ "loss": 1.6947,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.485968514715948,
504
+ "grad_norm": 9.01130485534668,
505
+ "learning_rate": 4.191193246634725e-05,
506
+ "loss": 1.5663,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.4928131416837782,
511
+ "grad_norm": 5.1406168937683105,
512
+ "learning_rate": 4.179785535021675e-05,
513
+ "loss": 1.6395,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.4996577686516085,
518
+ "grad_norm": 6.612627983093262,
519
+ "learning_rate": 4.1683778234086244e-05,
520
+ "loss": 1.6001,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.5065023956194388,
525
+ "grad_norm": 6.028298854827881,
526
+ "learning_rate": 4.156970111795574e-05,
527
+ "loss": 1.6807,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.5133470225872689,
532
+ "grad_norm": 6.188920021057129,
533
+ "learning_rate": 4.1455624001825236e-05,
534
+ "loss": 1.6086,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.5201916495550992,
539
+ "grad_norm": 8.069817543029785,
540
+ "learning_rate": 4.134154688569473e-05,
541
+ "loss": 1.6,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.5270362765229295,
546
+ "grad_norm": 7.538729190826416,
547
+ "learning_rate": 4.122746976956423e-05,
548
+ "loss": 1.6065,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.5338809034907598,
553
+ "grad_norm": 8.341115951538086,
554
+ "learning_rate": 4.1113392653433725e-05,
555
+ "loss": 1.6016,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.54072553045859,
560
+ "grad_norm": 8.315614700317383,
561
+ "learning_rate": 4.099931553730322e-05,
562
+ "loss": 1.6568,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.5475701574264202,
567
+ "grad_norm": 4.770896911621094,
568
+ "learning_rate": 4.088523842117272e-05,
569
+ "loss": 1.5996,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.5544147843942505,
574
+ "grad_norm": 4.912385940551758,
575
+ "learning_rate": 4.077116130504221e-05,
576
+ "loss": 1.6347,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.5612594113620808,
581
+ "grad_norm": 6.05177116394043,
582
+ "learning_rate": 4.06570841889117e-05,
583
+ "loss": 1.5761,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.568104038329911,
588
+ "grad_norm": 6.568109512329102,
589
+ "learning_rate": 4.0543007072781205e-05,
590
+ "loss": 1.6188,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.5749486652977412,
595
+ "grad_norm": 10.2117919921875,
596
+ "learning_rate": 4.0428929956650694e-05,
597
+ "loss": 1.6409,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.5817932922655715,
602
+ "grad_norm": 4.862828731536865,
603
+ "learning_rate": 4.03148528405202e-05,
604
+ "loss": 1.5973,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.5886379192334018,
609
+ "grad_norm": 3.6554880142211914,
610
+ "learning_rate": 4.0200775724389686e-05,
611
+ "loss": 1.5519,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.5954825462012321,
616
+ "grad_norm": 3.6871445178985596,
617
+ "learning_rate": 4.008669860825919e-05,
618
+ "loss": 1.5939,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.6023271731690623,
623
+ "grad_norm": 5.883737564086914,
624
+ "learning_rate": 3.997262149212868e-05,
625
+ "loss": 1.582,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.6091718001368925,
630
+ "grad_norm": 4.108626842498779,
631
+ "learning_rate": 3.9858544375998174e-05,
632
+ "loss": 1.509,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.6160164271047228,
637
+ "grad_norm": 9.638139724731445,
638
+ "learning_rate": 3.974446725986767e-05,
639
+ "loss": 1.6514,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.6228610540725531,
644
+ "grad_norm": 8.981685638427734,
645
+ "learning_rate": 3.9630390143737166e-05,
646
+ "loss": 1.6001,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.6297056810403833,
651
+ "grad_norm": 6.81504487991333,
652
+ "learning_rate": 3.951631302760667e-05,
653
+ "loss": 1.5607,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.6365503080082136,
658
+ "grad_norm": 6.192551612854004,
659
+ "learning_rate": 3.940223591147616e-05,
660
+ "loss": 1.5551,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.6433949349760438,
665
+ "grad_norm": 7.258768081665039,
666
+ "learning_rate": 3.9288158795345655e-05,
667
+ "loss": 1.6455,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.6502395619438741,
672
+ "grad_norm": 4.096240997314453,
673
+ "learning_rate": 3.917408167921515e-05,
674
+ "loss": 1.5765,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.6570841889117043,
679
+ "grad_norm": 6.833820819854736,
680
+ "learning_rate": 3.906000456308465e-05,
681
+ "loss": 1.6324,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.6639288158795346,
686
+ "grad_norm": 4.35950231552124,
687
+ "learning_rate": 3.894592744695414e-05,
688
+ "loss": 1.5625,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.6707734428473648,
693
+ "grad_norm": 9.174184799194336,
694
+ "learning_rate": 3.883185033082364e-05,
695
+ "loss": 1.5744,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.6776180698151951,
700
+ "grad_norm": 6.108720302581787,
701
+ "learning_rate": 3.8717773214693135e-05,
702
+ "loss": 1.5245,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.6844626967830253,
707
+ "grad_norm": 8.824295043945312,
708
+ "learning_rate": 3.860369609856263e-05,
709
+ "loss": 1.5643,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.6913073237508556,
714
+ "grad_norm": 6.103463649749756,
715
+ "learning_rate": 3.848961898243213e-05,
716
+ "loss": 1.5271,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.6981519507186859,
721
+ "grad_norm": 6.505345344543457,
722
+ "learning_rate": 3.837554186630162e-05,
723
+ "loss": 1.6373,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.7049965776865161,
728
+ "grad_norm": 4.757425785064697,
729
+ "learning_rate": 3.826146475017112e-05,
730
+ "loss": 1.4934,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.7118412046543463,
735
+ "grad_norm": 12.342533111572266,
736
+ "learning_rate": 3.8147387634040615e-05,
737
+ "loss": 1.6166,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.7186858316221766,
742
+ "grad_norm": 4.961431503295898,
743
+ "learning_rate": 3.803331051791011e-05,
744
+ "loss": 1.575,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.7255304585900069,
749
+ "grad_norm": 6.217925071716309,
750
+ "learning_rate": 3.79192334017796e-05,
751
+ "loss": 1.5436,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.7323750855578371,
756
+ "grad_norm": 9.116820335388184,
757
+ "learning_rate": 3.78051562856491e-05,
758
+ "loss": 1.6174,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.7392197125256673,
763
+ "grad_norm": 10.522767066955566,
764
+ "learning_rate": 3.769107916951859e-05,
765
+ "loss": 1.5167,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.7460643394934976,
770
+ "grad_norm": 5.094446659088135,
771
+ "learning_rate": 3.7577002053388095e-05,
772
+ "loss": 1.6682,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.7529089664613279,
777
+ "grad_norm": 5.020815849304199,
778
+ "learning_rate": 3.7462924937257585e-05,
779
+ "loss": 1.598,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.7597535934291582,
784
+ "grad_norm": 6.465487957000732,
785
+ "learning_rate": 3.734884782112709e-05,
786
+ "loss": 1.6022,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.7665982203969883,
791
+ "grad_norm": 8.065593719482422,
792
+ "learning_rate": 3.723477070499658e-05,
793
+ "loss": 1.5861,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.7734428473648186,
798
+ "grad_norm": 8.32030963897705,
799
+ "learning_rate": 3.712069358886607e-05,
800
+ "loss": 1.5812,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.7802874743326489,
805
+ "grad_norm": 13.515933990478516,
806
+ "learning_rate": 3.7006616472735576e-05,
807
+ "loss": 1.5541,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.7871321013004792,
812
+ "grad_norm": 4.896505355834961,
813
+ "learning_rate": 3.6892539356605065e-05,
814
+ "loss": 1.5803,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.7939767282683093,
819
+ "grad_norm": 7.793066024780273,
820
+ "learning_rate": 3.677846224047457e-05,
821
+ "loss": 1.5494,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.8008213552361396,
826
+ "grad_norm": 6.854223251342773,
827
+ "learning_rate": 3.666438512434406e-05,
828
+ "loss": 1.5534,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.8076659822039699,
833
+ "grad_norm": 7.112792491912842,
834
+ "learning_rate": 3.655030800821355e-05,
835
+ "loss": 1.4964,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.8145106091718002,
840
+ "grad_norm": 10.153095245361328,
841
+ "learning_rate": 3.643623089208305e-05,
842
+ "loss": 1.5004,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 0.8213552361396304,
847
+ "grad_norm": 10.011958122253418,
848
+ "learning_rate": 3.6322153775952545e-05,
849
+ "loss": 1.7269,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 0.8281998631074606,
854
+ "grad_norm": 11.939464569091797,
855
+ "learning_rate": 3.620807665982204e-05,
856
+ "loss": 1.6221,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 0.8350444900752909,
861
+ "grad_norm": 6.905048370361328,
862
+ "learning_rate": 3.609399954369154e-05,
863
+ "loss": 1.567,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 0.8418891170431212,
868
+ "grad_norm": 8.90895938873291,
869
+ "learning_rate": 3.597992242756103e-05,
870
+ "loss": 1.5438,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 0.8487337440109514,
875
+ "grad_norm": 5.8418779373168945,
876
+ "learning_rate": 3.586584531143053e-05,
877
+ "loss": 1.4932,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.8555783709787816,
882
+ "grad_norm": 15.751609802246094,
883
+ "learning_rate": 3.5751768195300025e-05,
884
+ "loss": 1.4868,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 0.8624229979466119,
889
+ "grad_norm": 7.625329494476318,
890
+ "learning_rate": 3.563769107916952e-05,
891
+ "loss": 1.5348,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 0.8692676249144422,
896
+ "grad_norm": 7.92191219329834,
897
+ "learning_rate": 3.552361396303902e-05,
898
+ "loss": 1.4559,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 0.8761122518822724,
903
+ "grad_norm": 8.768624305725098,
904
+ "learning_rate": 3.5409536846908514e-05,
905
+ "loss": 1.4969,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 0.8829568788501027,
910
+ "grad_norm": 5.574611186981201,
911
+ "learning_rate": 3.529545973077801e-05,
912
+ "loss": 1.5616,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 0.8898015058179329,
917
+ "grad_norm": 7.431711196899414,
918
+ "learning_rate": 3.51813826146475e-05,
919
+ "loss": 1.5569,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 0.8966461327857632,
924
+ "grad_norm": 7.5232086181640625,
925
+ "learning_rate": 3.5067305498517e-05,
926
+ "loss": 1.5393,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 0.9034907597535934,
931
+ "grad_norm": 9.488703727722168,
932
+ "learning_rate": 3.495322838238649e-05,
933
+ "loss": 1.5632,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 0.9103353867214237,
938
+ "grad_norm": 9.230395317077637,
939
+ "learning_rate": 3.4839151266255994e-05,
940
+ "loss": 1.5002,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 0.917180013689254,
945
+ "grad_norm": 9.00788402557373,
946
+ "learning_rate": 3.472507415012548e-05,
947
+ "loss": 1.4862,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 0.9240246406570842,
952
+ "grad_norm": 7.389601707458496,
953
+ "learning_rate": 3.461099703399498e-05,
954
+ "loss": 1.5165,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 0.9308692676249144,
959
+ "grad_norm": 3.9939334392547607,
960
+ "learning_rate": 3.449691991786448e-05,
961
+ "loss": 1.5359,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 0.9377138945927447,
966
+ "grad_norm": 10.2373628616333,
967
+ "learning_rate": 3.438284280173397e-05,
968
+ "loss": 1.5256,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 0.944558521560575,
973
+ "grad_norm": 8.441229820251465,
974
+ "learning_rate": 3.4268765685603474e-05,
975
+ "loss": 1.5307,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 0.9514031485284052,
980
+ "grad_norm": 9.391007423400879,
981
+ "learning_rate": 3.415468856947296e-05,
982
+ "loss": 1.6031,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 0.9582477754962354,
987
+ "grad_norm": 5.623960971832275,
988
+ "learning_rate": 3.4040611453342466e-05,
989
+ "loss": 1.5442,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 0.9650924024640657,
994
+ "grad_norm": 6.214602947235107,
995
+ "learning_rate": 3.3926534337211955e-05,
996
+ "loss": 1.483,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 0.971937029431896,
1001
+ "grad_norm": 6.769604682922363,
1002
+ "learning_rate": 3.381245722108145e-05,
1003
+ "loss": 1.5451,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 0.9787816563997263,
1008
+ "grad_norm": 12.718162536621094,
1009
+ "learning_rate": 3.369838010495095e-05,
1010
+ "loss": 1.491,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 0.9856262833675564,
1015
+ "grad_norm": 6.820464611053467,
1016
+ "learning_rate": 3.3584302988820444e-05,
1017
+ "loss": 1.4709,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 0.9924709103353867,
1022
+ "grad_norm": 9.189990997314453,
1023
+ "learning_rate": 3.347022587268994e-05,
1024
+ "loss": 1.5013,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 0.999315537303217,
1029
+ "grad_norm": 6.927376747131348,
1030
+ "learning_rate": 3.3356148756559436e-05,
1031
+ "loss": 1.4747,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 1.0061601642710472,
1036
+ "grad_norm": 5.837340831756592,
1037
+ "learning_rate": 3.324207164042893e-05,
1038
+ "loss": 1.5377,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 1.0130047912388775,
1043
+ "grad_norm": 9.09898853302002,
1044
+ "learning_rate": 3.312799452429843e-05,
1045
+ "loss": 1.3863,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 1.0198494182067077,
1050
+ "grad_norm": 8.584604263305664,
1051
+ "learning_rate": 3.3013917408167924e-05,
1052
+ "loss": 1.4328,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 1.0266940451745379,
1057
+ "grad_norm": 4.925144672393799,
1058
+ "learning_rate": 3.289984029203742e-05,
1059
+ "loss": 1.4637,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 1.0335386721423683,
1064
+ "grad_norm": 11.246452331542969,
1065
+ "learning_rate": 3.2785763175906916e-05,
1066
+ "loss": 1.5669,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 1.0403832991101984,
1071
+ "grad_norm": 9.101409912109375,
1072
+ "learning_rate": 3.267168605977641e-05,
1073
+ "loss": 1.3789,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 1.0472279260780288,
1078
+ "grad_norm": 12.243134498596191,
1079
+ "learning_rate": 3.255760894364591e-05,
1080
+ "loss": 1.5386,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 1.054072553045859,
1085
+ "grad_norm": 11.983848571777344,
1086
+ "learning_rate": 3.24435318275154e-05,
1087
+ "loss": 1.5702,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 1.0609171800136892,
1092
+ "grad_norm": 7.911087989807129,
1093
+ "learning_rate": 3.23294547113849e-05,
1094
+ "loss": 1.486,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 1.0677618069815196,
1099
+ "grad_norm": 4.611266136169434,
1100
+ "learning_rate": 3.221537759525439e-05,
1101
+ "loss": 1.503,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 1.0746064339493497,
1106
+ "grad_norm": 5.226015567779541,
1107
+ "learning_rate": 3.210130047912389e-05,
1108
+ "loss": 1.4103,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 1.0814510609171801,
1113
+ "grad_norm": 11.175928115844727,
1114
+ "learning_rate": 3.198722336299338e-05,
1115
+ "loss": 1.4847,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 1.0882956878850103,
1120
+ "grad_norm": 8.721378326416016,
1121
+ "learning_rate": 3.187314624686288e-05,
1122
+ "loss": 1.4551,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 1.0951403148528405,
1127
+ "grad_norm": 8.155839920043945,
1128
+ "learning_rate": 3.175906913073238e-05,
1129
+ "loss": 1.3424,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 1.1019849418206709,
1134
+ "grad_norm": 14.279895782470703,
1135
+ "learning_rate": 3.164499201460187e-05,
1136
+ "loss": 1.411,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 1.108829568788501,
1141
+ "grad_norm": 10.556158065795898,
1142
+ "learning_rate": 3.153091489847137e-05,
1143
+ "loss": 1.3987,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 1.1156741957563312,
1148
+ "grad_norm": 13.61709976196289,
1149
+ "learning_rate": 3.141683778234086e-05,
1150
+ "loss": 1.4496,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 1.1225188227241616,
1155
+ "grad_norm": 10.42212963104248,
1156
+ "learning_rate": 3.1302760666210365e-05,
1157
+ "loss": 1.3844,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 1.1293634496919918,
1162
+ "grad_norm": 11.602677345275879,
1163
+ "learning_rate": 3.1188683550079854e-05,
1164
+ "loss": 1.3904,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 1.136208076659822,
1169
+ "grad_norm": 7.478072643280029,
1170
+ "learning_rate": 3.107460643394935e-05,
1171
+ "loss": 1.4705,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 1.1430527036276523,
1176
+ "grad_norm": 9.382685661315918,
1177
+ "learning_rate": 3.0960529317818846e-05,
1178
+ "loss": 1.3273,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 1.1498973305954825,
1183
+ "grad_norm": 11.722029685974121,
1184
+ "learning_rate": 3.084645220168834e-05,
1185
+ "loss": 1.4704,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 1.1567419575633129,
1190
+ "grad_norm": 6.231897830963135,
1191
+ "learning_rate": 3.073237508555784e-05,
1192
+ "loss": 1.3317,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 1.163586584531143,
1197
+ "grad_norm": 8.364891052246094,
1198
+ "learning_rate": 3.0618297969427334e-05,
1199
+ "loss": 1.3771,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 1.1704312114989732,
1204
+ "grad_norm": 18.594911575317383,
1205
+ "learning_rate": 3.0504220853296827e-05,
1206
+ "loss": 1.3757,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 1.1772758384668036,
1211
+ "grad_norm": 12.341059684753418,
1212
+ "learning_rate": 3.0390143737166326e-05,
1213
+ "loss": 1.2105,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 1.1841204654346338,
1218
+ "grad_norm": 11.751667976379395,
1219
+ "learning_rate": 3.027606662103582e-05,
1220
+ "loss": 1.4507,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 1.1909650924024642,
1225
+ "grad_norm": 9.054430961608887,
1226
+ "learning_rate": 3.016198950490532e-05,
1227
+ "loss": 1.3518,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 1.1978097193702943,
1232
+ "grad_norm": 10.657920837402344,
1233
+ "learning_rate": 3.004791238877481e-05,
1234
+ "loss": 1.3997,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 1.2046543463381245,
1239
+ "grad_norm": 9.505620002746582,
1240
+ "learning_rate": 2.993383527264431e-05,
1241
+ "loss": 1.4132,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 1.211498973305955,
1246
+ "grad_norm": 13.155516624450684,
1247
+ "learning_rate": 2.9819758156513807e-05,
1248
+ "loss": 1.4084,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 1.218343600273785,
1253
+ "grad_norm": 5.281780242919922,
1254
+ "learning_rate": 2.97056810403833e-05,
1255
+ "loss": 1.4072,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 1.2251882272416152,
1260
+ "grad_norm": 9.554500579833984,
1261
+ "learning_rate": 2.95916039242528e-05,
1262
+ "loss": 1.486,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 1.2320328542094456,
1267
+ "grad_norm": 9.720778465270996,
1268
+ "learning_rate": 2.947752680812229e-05,
1269
+ "loss": 1.3155,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 1.2388774811772758,
1274
+ "grad_norm": 11.252222061157227,
1275
+ "learning_rate": 2.936344969199179e-05,
1276
+ "loss": 1.2644,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 1.245722108145106,
1281
+ "grad_norm": 18.367835998535156,
1282
+ "learning_rate": 2.9249372575861283e-05,
1283
+ "loss": 1.3827,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 1.2525667351129364,
1288
+ "grad_norm": 11.841431617736816,
1289
+ "learning_rate": 2.9135295459730776e-05,
1290
+ "loss": 1.2508,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 1.2594113620807665,
1295
+ "grad_norm": 13.085965156555176,
1296
+ "learning_rate": 2.9021218343600276e-05,
1297
+ "loss": 1.4025,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 1.266255989048597,
1302
+ "grad_norm": 15.112276077270508,
1303
+ "learning_rate": 2.8907141227469768e-05,
1304
+ "loss": 1.3554,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 1.273100616016427,
1309
+ "grad_norm": 11.756820678710938,
1310
+ "learning_rate": 2.8793064111339268e-05,
1311
+ "loss": 1.3882,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 1.2799452429842573,
1316
+ "grad_norm": 10.950437545776367,
1317
+ "learning_rate": 2.867898699520876e-05,
1318
+ "loss": 1.3094,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 1.2867898699520877,
1323
+ "grad_norm": 11.606042861938477,
1324
+ "learning_rate": 2.856490987907826e-05,
1325
+ "loss": 1.2479,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 1.2936344969199178,
1330
+ "grad_norm": 8.162748336791992,
1331
+ "learning_rate": 2.8450832762947756e-05,
1332
+ "loss": 1.3937,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 1.3004791238877482,
1337
+ "grad_norm": 10.575776100158691,
1338
+ "learning_rate": 2.833675564681725e-05,
1339
+ "loss": 1.4198,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 1.3073237508555784,
1344
+ "grad_norm": 14.69323444366455,
1345
+ "learning_rate": 2.8222678530686748e-05,
1346
+ "loss": 1.3989,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 1.3141683778234086,
1351
+ "grad_norm": 9.671676635742188,
1352
+ "learning_rate": 2.810860141455624e-05,
1353
+ "loss": 1.323,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 1.321013004791239,
1358
+ "grad_norm": 14.00036334991455,
1359
+ "learning_rate": 2.799452429842574e-05,
1360
+ "loss": 1.4533,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 1.3278576317590691,
1365
+ "grad_norm": 10.33249282836914,
1366
+ "learning_rate": 2.7880447182295233e-05,
1367
+ "loss": 1.4039,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 1.3347022587268995,
1372
+ "grad_norm": 6.003030776977539,
1373
+ "learning_rate": 2.7766370066164725e-05,
1374
+ "loss": 1.2639,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 1.3415468856947297,
1379
+ "grad_norm": 22.026813507080078,
1380
+ "learning_rate": 2.7652292950034225e-05,
1381
+ "loss": 1.4186,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 1.3483915126625599,
1386
+ "grad_norm": 26.090612411499023,
1387
+ "learning_rate": 2.7538215833903717e-05,
1388
+ "loss": 1.391,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 1.35523613963039,
1393
+ "grad_norm": 12.183039665222168,
1394
+ "learning_rate": 2.7424138717773217e-05,
1395
+ "loss": 1.4351,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 1.3620807665982204,
1400
+ "grad_norm": 13.481134414672852,
1401
+ "learning_rate": 2.7310061601642713e-05,
1402
+ "loss": 1.3032,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 1.3689253935660506,
1407
+ "grad_norm": 8.674177169799805,
1408
+ "learning_rate": 2.7195984485512206e-05,
1409
+ "loss": 1.4075,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 1.375770020533881,
1414
+ "grad_norm": 8.313282012939453,
1415
+ "learning_rate": 2.7081907369381705e-05,
1416
+ "loss": 1.2579,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 1.3826146475017111,
1421
+ "grad_norm": 6.6830973625183105,
1422
+ "learning_rate": 2.6967830253251198e-05,
1423
+ "loss": 1.4663,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 1.3894592744695413,
1428
+ "grad_norm": 14.910584449768066,
1429
+ "learning_rate": 2.6853753137120697e-05,
1430
+ "loss": 1.2804,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 1.3963039014373717,
1435
+ "grad_norm": 8.96900463104248,
1436
+ "learning_rate": 2.673967602099019e-05,
1437
+ "loss": 1.451,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 1.4031485284052019,
1442
+ "grad_norm": 9.932779312133789,
1443
+ "learning_rate": 2.662559890485969e-05,
1444
+ "loss": 1.34,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 1.4099931553730323,
1449
+ "grad_norm": 15.112752914428711,
1450
+ "learning_rate": 2.6511521788729182e-05,
1451
+ "loss": 1.3144,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 1.4168377823408624,
1456
+ "grad_norm": 11.542669296264648,
1457
+ "learning_rate": 2.6397444672598675e-05,
1458
+ "loss": 1.3211,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 1.4236824093086926,
1463
+ "grad_norm": 12.117714881896973,
1464
+ "learning_rate": 2.6283367556468174e-05,
1465
+ "loss": 1.301,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 1.430527036276523,
1470
+ "grad_norm": 9.843555450439453,
1471
+ "learning_rate": 2.6169290440337667e-05,
1472
+ "loss": 1.4528,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 1.4373716632443532,
1477
+ "grad_norm": 11.246613502502441,
1478
+ "learning_rate": 2.6055213324207166e-05,
1479
+ "loss": 1.4052,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 1.4442162902121836,
1484
+ "grad_norm": 11.090378761291504,
1485
+ "learning_rate": 2.5941136208076662e-05,
1486
+ "loss": 1.2711,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 1.4510609171800137,
1491
+ "grad_norm": 8.056124687194824,
1492
+ "learning_rate": 2.5827059091946155e-05,
1493
+ "loss": 1.3582,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 1.457905544147844,
1498
+ "grad_norm": 9.219451904296875,
1499
+ "learning_rate": 2.5712981975815654e-05,
1500
+ "loss": 1.4546,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 1.464750171115674,
1505
+ "grad_norm": 8.210469245910645,
1506
+ "learning_rate": 2.5598904859685147e-05,
1507
+ "loss": 1.3248,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 1.4715947980835045,
1512
+ "grad_norm": 15.370203971862793,
1513
+ "learning_rate": 2.5484827743554646e-05,
1514
+ "loss": 1.541,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 1.4784394250513346,
1519
+ "grad_norm": 11.43657112121582,
1520
+ "learning_rate": 2.537075062742414e-05,
1521
+ "loss": 1.3388,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 1.485284052019165,
1526
+ "grad_norm": 16.973400115966797,
1527
+ "learning_rate": 2.525667351129364e-05,
1528
+ "loss": 1.3192,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 1.4921286789869952,
1533
+ "grad_norm": 13.537483215332031,
1534
+ "learning_rate": 2.514259639516313e-05,
1535
+ "loss": 1.3148,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 1.4989733059548254,
1540
+ "grad_norm": 12.264762878417969,
1541
+ "learning_rate": 2.5028519279032624e-05,
1542
+ "loss": 1.2356,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 1.5058179329226558,
1547
+ "grad_norm": 29.134258270263672,
1548
+ "learning_rate": 2.4914442162902123e-05,
1549
+ "loss": 1.4712,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 1.512662559890486,
1554
+ "grad_norm": 14.758463859558105,
1555
+ "learning_rate": 2.4800365046771616e-05,
1556
+ "loss": 1.2829,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 1.5195071868583163,
1561
+ "grad_norm": 16.932037353515625,
1562
+ "learning_rate": 2.4686287930641115e-05,
1563
+ "loss": 1.493,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 1.5263518138261465,
1568
+ "grad_norm": 20.093650817871094,
1569
+ "learning_rate": 2.457221081451061e-05,
1570
+ "loss": 1.3119,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 1.5331964407939767,
1575
+ "grad_norm": 15.346664428710938,
1576
+ "learning_rate": 2.4458133698380107e-05,
1577
+ "loss": 1.382,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 1.5400410677618068,
1582
+ "grad_norm": 6.908050537109375,
1583
+ "learning_rate": 2.4344056582249603e-05,
1584
+ "loss": 1.2735,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 1.5468856947296372,
1589
+ "grad_norm": 17.50838851928711,
1590
+ "learning_rate": 2.42299794661191e-05,
1591
+ "loss": 1.3185,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 1.5537303216974676,
1596
+ "grad_norm": 8.241186141967773,
1597
+ "learning_rate": 2.4115902349988592e-05,
1598
+ "loss": 1.3027,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 1.5605749486652978,
1603
+ "grad_norm": 10.923578262329102,
1604
+ "learning_rate": 2.4001825233858088e-05,
1605
+ "loss": 1.3528,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 1.567419575633128,
1610
+ "grad_norm": 15.049988746643066,
1611
+ "learning_rate": 2.3887748117727584e-05,
1612
+ "loss": 1.4557,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 1.5742642026009581,
1617
+ "grad_norm": 13.151463508605957,
1618
+ "learning_rate": 2.377367100159708e-05,
1619
+ "loss": 1.3979,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 1.5811088295687885,
1624
+ "grad_norm": 11.79454517364502,
1625
+ "learning_rate": 2.3659593885466576e-05,
1626
+ "loss": 1.3471,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 1.587953456536619,
1631
+ "grad_norm": 15.728011131286621,
1632
+ "learning_rate": 2.3545516769336072e-05,
1633
+ "loss": 1.3058,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 1.594798083504449,
1638
+ "grad_norm": 8.729684829711914,
1639
+ "learning_rate": 2.343143965320557e-05,
1640
+ "loss": 1.3549,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 1.6016427104722792,
1645
+ "grad_norm": 13.094536781311035,
1646
+ "learning_rate": 2.3317362537075065e-05,
1647
+ "loss": 1.3727,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 1.6084873374401094,
1652
+ "grad_norm": 13.077216148376465,
1653
+ "learning_rate": 2.320328542094456e-05,
1654
+ "loss": 1.3528,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 1.6153319644079398,
1659
+ "grad_norm": 8.378471374511719,
1660
+ "learning_rate": 2.3089208304814057e-05,
1661
+ "loss": 1.3193,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 1.62217659137577,
1666
+ "grad_norm": 13.543414115905762,
1667
+ "learning_rate": 2.2975131188683553e-05,
1668
+ "loss": 1.3609,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 1.6290212183436004,
1673
+ "grad_norm": 8.93952465057373,
1674
+ "learning_rate": 2.286105407255305e-05,
1675
+ "loss": 1.3854,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 1.6358658453114305,
1680
+ "grad_norm": 10.618241310119629,
1681
+ "learning_rate": 2.274697695642254e-05,
1682
+ "loss": 1.3382,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 1.6427104722792607,
1687
+ "grad_norm": 16.9578857421875,
1688
+ "learning_rate": 2.2632899840292038e-05,
1689
+ "loss": 1.1266,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 1.6495550992470909,
1694
+ "grad_norm": 4.807933330535889,
1695
+ "learning_rate": 2.2518822724161534e-05,
1696
+ "loss": 1.2828,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 1.6563997262149213,
1701
+ "grad_norm": 17.285863876342773,
1702
+ "learning_rate": 2.240474560803103e-05,
1703
+ "loss": 1.4196,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 1.6632443531827517,
1708
+ "grad_norm": 8.696662902832031,
1709
+ "learning_rate": 2.2290668491900526e-05,
1710
+ "loss": 1.2641,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 1.6700889801505818,
1715
+ "grad_norm": 17.971515655517578,
1716
+ "learning_rate": 2.2176591375770022e-05,
1717
+ "loss": 1.3364,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 1.676933607118412,
1722
+ "grad_norm": 29.861949920654297,
1723
+ "learning_rate": 2.2062514259639518e-05,
1724
+ "loss": 1.2867,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 1.6837782340862422,
1729
+ "grad_norm": 21.080684661865234,
1730
+ "learning_rate": 2.1948437143509014e-05,
1731
+ "loss": 1.407,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 1.6906228610540726,
1736
+ "grad_norm": 11.527396202087402,
1737
+ "learning_rate": 2.183436002737851e-05,
1738
+ "loss": 1.3941,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 1.697467488021903,
1743
+ "grad_norm": 10.472256660461426,
1744
+ "learning_rate": 2.1720282911248006e-05,
1745
+ "loss": 1.3766,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 1.7043121149897331,
1750
+ "grad_norm": 10.66096019744873,
1751
+ "learning_rate": 2.1606205795117502e-05,
1752
+ "loss": 1.4121,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 1.7111567419575633,
1757
+ "grad_norm": 10.251544952392578,
1758
+ "learning_rate": 2.1492128678986998e-05,
1759
+ "loss": 1.3294,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 1.7180013689253935,
1764
+ "grad_norm": 8.439862251281738,
1765
+ "learning_rate": 2.137805156285649e-05,
1766
+ "loss": 1.2409,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 1.7248459958932238,
1771
+ "grad_norm": 12.634255409240723,
1772
+ "learning_rate": 2.1263974446725987e-05,
1773
+ "loss": 1.3046,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 1.731690622861054,
1778
+ "grad_norm": 9.006020545959473,
1779
+ "learning_rate": 2.1149897330595483e-05,
1780
+ "loss": 1.2067,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 1.7385352498288844,
1785
+ "grad_norm": 8.736598014831543,
1786
+ "learning_rate": 2.103582021446498e-05,
1787
+ "loss": 1.253,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 1.7453798767967146,
1792
+ "grad_norm": 13.842065811157227,
1793
+ "learning_rate": 2.0921743098334475e-05,
1794
+ "loss": 1.3179,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 1.7522245037645447,
1799
+ "grad_norm": 9.174357414245605,
1800
+ "learning_rate": 2.080766598220397e-05,
1801
+ "loss": 1.2219,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 1.759069130732375,
1806
+ "grad_norm": 12.11344051361084,
1807
+ "learning_rate": 2.0693588866073467e-05,
1808
+ "loss": 1.2061,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 1.7659137577002053,
1813
+ "grad_norm": 18.4387264251709,
1814
+ "learning_rate": 2.0579511749942963e-05,
1815
+ "loss": 1.2647,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 1.7727583846680357,
1820
+ "grad_norm": 13.510963439941406,
1821
+ "learning_rate": 2.046543463381246e-05,
1822
+ "loss": 1.3283,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 1.7796030116358659,
1827
+ "grad_norm": 13.67849349975586,
1828
+ "learning_rate": 2.0351357517681955e-05,
1829
+ "loss": 1.2672,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 1.786447638603696,
1834
+ "grad_norm": 15.734115600585938,
1835
+ "learning_rate": 2.023728040155145e-05,
1836
+ "loss": 1.1421,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 1.7932922655715262,
1841
+ "grad_norm": 17.551671981811523,
1842
+ "learning_rate": 2.0123203285420947e-05,
1843
+ "loss": 1.2805,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 1.8001368925393566,
1848
+ "grad_norm": 10.481766700744629,
1849
+ "learning_rate": 2.000912616929044e-05,
1850
+ "loss": 1.3054,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 1.806981519507187,
1855
+ "grad_norm": 12.874540328979492,
1856
+ "learning_rate": 1.9895049053159936e-05,
1857
+ "loss": 1.3856,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 1.8138261464750172,
1862
+ "grad_norm": 7.882504940032959,
1863
+ "learning_rate": 1.9780971937029432e-05,
1864
+ "loss": 1.3146,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 1.8206707734428473,
1869
+ "grad_norm": 13.011261940002441,
1870
+ "learning_rate": 1.9666894820898928e-05,
1871
+ "loss": 1.3287,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 1.8275154004106775,
1876
+ "grad_norm": 9.366372108459473,
1877
+ "learning_rate": 1.9552817704768424e-05,
1878
+ "loss": 1.2263,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 1.834360027378508,
1883
+ "grad_norm": 8.002914428710938,
1884
+ "learning_rate": 1.943874058863792e-05,
1885
+ "loss": 1.3044,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 1.8412046543463383,
1890
+ "grad_norm": 9.616382598876953,
1891
+ "learning_rate": 1.9324663472507416e-05,
1892
+ "loss": 1.2764,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 1.8480492813141685,
1897
+ "grad_norm": 18.51015853881836,
1898
+ "learning_rate": 1.9210586356376912e-05,
1899
+ "loss": 1.4593,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 1.8548939082819986,
1904
+ "grad_norm": 12.567821502685547,
1905
+ "learning_rate": 1.909650924024641e-05,
1906
+ "loss": 1.2796,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 1.8617385352498288,
1911
+ "grad_norm": 9.510887145996094,
1912
+ "learning_rate": 1.8982432124115904e-05,
1913
+ "loss": 1.2913,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 1.8685831622176592,
1918
+ "grad_norm": 6.420870304107666,
1919
+ "learning_rate": 1.88683550079854e-05,
1920
+ "loss": 1.2753,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 1.8754277891854894,
1925
+ "grad_norm": 15.304129600524902,
1926
+ "learning_rate": 1.8754277891854896e-05,
1927
+ "loss": 1.4278,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 1.8822724161533197,
1932
+ "grad_norm": 9.933164596557617,
1933
+ "learning_rate": 1.864020077572439e-05,
1934
+ "loss": 1.324,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 1.88911704312115,
1939
+ "grad_norm": 10.273710250854492,
1940
+ "learning_rate": 1.8526123659593885e-05,
1941
+ "loss": 1.241,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 1.89596167008898,
1946
+ "grad_norm": 18.104686737060547,
1947
+ "learning_rate": 1.841204654346338e-05,
1948
+ "loss": 1.1503,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 1.9028062970568103,
1953
+ "grad_norm": 8.989583969116211,
1954
+ "learning_rate": 1.8297969427332877e-05,
1955
+ "loss": 1.1591,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 1.9096509240246407,
1960
+ "grad_norm": 9.96402359008789,
1961
+ "learning_rate": 1.8183892311202373e-05,
1962
+ "loss": 1.4079,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 1.916495550992471,
1967
+ "grad_norm": 13.352839469909668,
1968
+ "learning_rate": 1.806981519507187e-05,
1969
+ "loss": 1.3718,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 1.9233401779603012,
1974
+ "grad_norm": 15.031063079833984,
1975
+ "learning_rate": 1.7955738078941365e-05,
1976
+ "loss": 1.3096,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 1.9301848049281314,
1981
+ "grad_norm": 17.026588439941406,
1982
+ "learning_rate": 1.784166096281086e-05,
1983
+ "loss": 1.2287,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 1.9370294318959616,
1988
+ "grad_norm": 8.791502952575684,
1989
+ "learning_rate": 1.7727583846680358e-05,
1990
+ "loss": 1.3572,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 1.943874058863792,
1995
+ "grad_norm": 14.80101203918457,
1996
+ "learning_rate": 1.7613506730549854e-05,
1997
+ "loss": 1.2569,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 1.9507186858316223,
2002
+ "grad_norm": 10.527932167053223,
2003
+ "learning_rate": 1.749942961441935e-05,
2004
+ "loss": 1.2803,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 1.9575633127994525,
2009
+ "grad_norm": 26.88188934326172,
2010
+ "learning_rate": 1.7385352498288842e-05,
2011
+ "loss": 1.3006,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 1.9644079397672827,
2016
+ "grad_norm": 10.920092582702637,
2017
+ "learning_rate": 1.727127538215834e-05,
2018
+ "loss": 1.3342,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 1.9712525667351128,
2023
+ "grad_norm": 17.363473892211914,
2024
+ "learning_rate": 1.7157198266027834e-05,
2025
+ "loss": 1.1818,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 1.9780971937029432,
2030
+ "grad_norm": 12.1474027633667,
2031
+ "learning_rate": 1.704312114989733e-05,
2032
+ "loss": 1.268,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 1.9849418206707734,
2037
+ "grad_norm": 13.043349266052246,
2038
+ "learning_rate": 1.6929044033766827e-05,
2039
+ "loss": 1.3654,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 1.9917864476386038,
2044
+ "grad_norm": 8.357322692871094,
2045
+ "learning_rate": 1.6814966917636323e-05,
2046
+ "loss": 1.2089,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 1.998631074606434,
2051
+ "grad_norm": 9.498217582702637,
2052
+ "learning_rate": 1.670088980150582e-05,
2053
+ "loss": 1.2394,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 2.005475701574264,
2058
+ "grad_norm": 9.406332015991211,
2059
+ "learning_rate": 1.6586812685375315e-05,
2060
+ "loss": 1.1561,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 2.0123203285420943,
2065
+ "grad_norm": 13.764726638793945,
2066
+ "learning_rate": 1.647273556924481e-05,
2067
+ "loss": 1.1865,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 2.019164955509925,
2072
+ "grad_norm": 10.511940002441406,
2073
+ "learning_rate": 1.6358658453114307e-05,
2074
+ "loss": 1.1132,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 2.026009582477755,
2079
+ "grad_norm": 6.873147010803223,
2080
+ "learning_rate": 1.6244581336983803e-05,
2081
+ "loss": 0.9781,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 2.0328542094455853,
2086
+ "grad_norm": 21.03345489501953,
2087
+ "learning_rate": 1.61305042208533e-05,
2088
+ "loss": 1.1766,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 2.0396988364134154,
2093
+ "grad_norm": 12.291481018066406,
2094
+ "learning_rate": 1.601642710472279e-05,
2095
+ "loss": 1.0643,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 2.0465434633812456,
2100
+ "grad_norm": 8.3342866897583,
2101
+ "learning_rate": 1.5902349988592288e-05,
2102
+ "loss": 0.9735,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 2.0533880903490758,
2107
+ "grad_norm": 8.835681915283203,
2108
+ "learning_rate": 1.5788272872461784e-05,
2109
+ "loss": 1.1004,
2110
+ "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 2.0602327173169064,
2114
+ "grad_norm": 12.306584358215332,
2115
+ "learning_rate": 1.567419575633128e-05,
2116
+ "loss": 1.086,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 2.0670773442847366,
2121
+ "grad_norm": 16.78770637512207,
2122
+ "learning_rate": 1.5560118640200776e-05,
2123
+ "loss": 1.1036,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 2.0739219712525667,
2128
+ "grad_norm": 11.191411972045898,
2129
+ "learning_rate": 1.5446041524070272e-05,
2130
+ "loss": 1.0532,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 2.080766598220397,
2135
+ "grad_norm": 21.93419647216797,
2136
+ "learning_rate": 1.5331964407939768e-05,
2137
+ "loss": 0.9814,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 2.087611225188227,
2142
+ "grad_norm": 13.316761016845703,
2143
+ "learning_rate": 1.5217887291809262e-05,
2144
+ "loss": 1.0223,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 2.0944558521560577,
2149
+ "grad_norm": 23.466629028320312,
2150
+ "learning_rate": 1.510381017567876e-05,
2151
+ "loss": 1.1022,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 2.101300479123888,
2156
+ "grad_norm": 8.202319145202637,
2157
+ "learning_rate": 1.4989733059548256e-05,
2158
+ "loss": 1.0584,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 2.108145106091718,
2163
+ "grad_norm": 11.632257461547852,
2164
+ "learning_rate": 1.4875655943417752e-05,
2165
+ "loss": 1.0143,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 2.114989733059548,
2170
+ "grad_norm": 16.47410011291504,
2171
+ "learning_rate": 1.4761578827287248e-05,
2172
+ "loss": 0.9763,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 2.1218343600273784,
2177
+ "grad_norm": 15.501402854919434,
2178
+ "learning_rate": 1.464750171115674e-05,
2179
+ "loss": 0.8621,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 2.128678986995209,
2184
+ "grad_norm": 9.7792329788208,
2185
+ "learning_rate": 1.4533424595026237e-05,
2186
+ "loss": 1.2403,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 2.135523613963039,
2191
+ "grad_norm": 12.386256217956543,
2192
+ "learning_rate": 1.4419347478895735e-05,
2193
+ "loss": 0.9812,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 2.1423682409308693,
2198
+ "grad_norm": 10.27375602722168,
2199
+ "learning_rate": 1.430527036276523e-05,
2200
+ "loss": 1.1645,
2201
+ "step": 3130
2202
+ },
2203
+ {
2204
+ "epoch": 2.1492128678986995,
2205
+ "grad_norm": 13.420565605163574,
2206
+ "learning_rate": 1.4191193246634727e-05,
2207
+ "loss": 0.9709,
2208
+ "step": 3140
2209
+ },
2210
+ {
2211
+ "epoch": 2.1560574948665296,
2212
+ "grad_norm": 13.358121871948242,
2213
+ "learning_rate": 1.4077116130504223e-05,
2214
+ "loss": 0.9884,
2215
+ "step": 3150
2216
+ },
2217
+ {
2218
+ "epoch": 2.1629021218343603,
2219
+ "grad_norm": 11.175071716308594,
2220
+ "learning_rate": 1.3963039014373715e-05,
2221
+ "loss": 1.023,
2222
+ "step": 3160
2223
+ },
2224
+ {
2225
+ "epoch": 2.1697467488021904,
2226
+ "grad_norm": 12.370025634765625,
2227
+ "learning_rate": 1.3848961898243213e-05,
2228
+ "loss": 1.0526,
2229
+ "step": 3170
2230
+ },
2231
+ {
2232
+ "epoch": 2.1765913757700206,
2233
+ "grad_norm": 15.10377311706543,
2234
+ "learning_rate": 1.373488478211271e-05,
2235
+ "loss": 0.9864,
2236
+ "step": 3180
2237
+ },
2238
+ {
2239
+ "epoch": 2.1834360027378508,
2240
+ "grad_norm": 11.344932556152344,
2241
+ "learning_rate": 1.3620807665982205e-05,
2242
+ "loss": 1.1532,
2243
+ "step": 3190
2244
+ },
2245
+ {
2246
+ "epoch": 2.190280629705681,
2247
+ "grad_norm": 10.86037826538086,
2248
+ "learning_rate": 1.3506730549851701e-05,
2249
+ "loss": 1.0622,
2250
+ "step": 3200
2251
+ },
2252
+ {
2253
+ "epoch": 2.197125256673511,
2254
+ "grad_norm": 21.841835021972656,
2255
+ "learning_rate": 1.3392653433721197e-05,
2256
+ "loss": 1.168,
2257
+ "step": 3210
2258
+ },
2259
+ {
2260
+ "epoch": 2.2039698836413417,
2261
+ "grad_norm": 15.024373054504395,
2262
+ "learning_rate": 1.327857631759069e-05,
2263
+ "loss": 0.9055,
2264
+ "step": 3220
2265
+ },
2266
+ {
2267
+ "epoch": 2.210814510609172,
2268
+ "grad_norm": 20.061037063598633,
2269
+ "learning_rate": 1.3164499201460188e-05,
2270
+ "loss": 1.0555,
2271
+ "step": 3230
2272
+ },
2273
+ {
2274
+ "epoch": 2.217659137577002,
2275
+ "grad_norm": 11.157723426818848,
2276
+ "learning_rate": 1.3050422085329684e-05,
2277
+ "loss": 0.9714,
2278
+ "step": 3240
2279
+ },
2280
+ {
2281
+ "epoch": 2.2245037645448322,
2282
+ "grad_norm": 13.620951652526855,
2283
+ "learning_rate": 1.293634496919918e-05,
2284
+ "loss": 0.9512,
2285
+ "step": 3250
2286
+ },
2287
+ {
2288
+ "epoch": 2.2313483915126624,
2289
+ "grad_norm": 17.227863311767578,
2290
+ "learning_rate": 1.2822267853068676e-05,
2291
+ "loss": 1.0676,
2292
+ "step": 3260
2293
+ },
2294
+ {
2295
+ "epoch": 2.238193018480493,
2296
+ "grad_norm": 14.115557670593262,
2297
+ "learning_rate": 1.2708190736938172e-05,
2298
+ "loss": 1.0418,
2299
+ "step": 3270
2300
+ },
2301
+ {
2302
+ "epoch": 2.245037645448323,
2303
+ "grad_norm": 14.355422973632812,
2304
+ "learning_rate": 1.2594113620807665e-05,
2305
+ "loss": 0.945,
2306
+ "step": 3280
2307
+ },
2308
+ {
2309
+ "epoch": 2.2518822724161534,
2310
+ "grad_norm": 11.699935913085938,
2311
+ "learning_rate": 1.2480036504677162e-05,
2312
+ "loss": 0.9937,
2313
+ "step": 3290
2314
+ },
2315
+ {
2316
+ "epoch": 2.2587268993839835,
2317
+ "grad_norm": 17.60722541809082,
2318
+ "learning_rate": 1.2365959388546658e-05,
2319
+ "loss": 0.9713,
2320
+ "step": 3300
2321
+ },
2322
+ {
2323
+ "epoch": 2.2655715263518137,
2324
+ "grad_norm": 19.072879791259766,
2325
+ "learning_rate": 1.2251882272416155e-05,
2326
+ "loss": 1.0041,
2327
+ "step": 3310
2328
+ },
2329
+ {
2330
+ "epoch": 2.272416153319644,
2331
+ "grad_norm": 17.29334831237793,
2332
+ "learning_rate": 1.2137805156285649e-05,
2333
+ "loss": 1.0048,
2334
+ "step": 3320
2335
+ },
2336
+ {
2337
+ "epoch": 2.2792607802874745,
2338
+ "grad_norm": 14.827413558959961,
2339
+ "learning_rate": 1.2023728040155145e-05,
2340
+ "loss": 0.9177,
2341
+ "step": 3330
2342
+ },
2343
+ {
2344
+ "epoch": 2.2861054072553046,
2345
+ "grad_norm": 41.71753692626953,
2346
+ "learning_rate": 1.1909650924024641e-05,
2347
+ "loss": 1.0942,
2348
+ "step": 3340
2349
+ },
2350
+ {
2351
+ "epoch": 2.292950034223135,
2352
+ "grad_norm": 37.5557861328125,
2353
+ "learning_rate": 1.1795573807894137e-05,
2354
+ "loss": 1.0733,
2355
+ "step": 3350
2356
+ },
2357
+ {
2358
+ "epoch": 2.299794661190965,
2359
+ "grad_norm": 10.807480812072754,
2360
+ "learning_rate": 1.1681496691763633e-05,
2361
+ "loss": 0.7737,
2362
+ "step": 3360
2363
+ },
2364
+ {
2365
+ "epoch": 2.306639288158795,
2366
+ "grad_norm": 26.820894241333008,
2367
+ "learning_rate": 1.1567419575633129e-05,
2368
+ "loss": 1.2099,
2369
+ "step": 3370
2370
+ },
2371
+ {
2372
+ "epoch": 2.3134839151266258,
2373
+ "grad_norm": 18.552034378051758,
2374
+ "learning_rate": 1.1453342459502623e-05,
2375
+ "loss": 1.0619,
2376
+ "step": 3380
2377
+ },
2378
+ {
2379
+ "epoch": 2.320328542094456,
2380
+ "grad_norm": 16.6098690032959,
2381
+ "learning_rate": 1.133926534337212e-05,
2382
+ "loss": 0.9676,
2383
+ "step": 3390
2384
+ },
2385
+ {
2386
+ "epoch": 2.327173169062286,
2387
+ "grad_norm": 21.494001388549805,
2388
+ "learning_rate": 1.1225188227241616e-05,
2389
+ "loss": 0.9241,
2390
+ "step": 3400
2391
+ },
2392
+ {
2393
+ "epoch": 2.3340177960301163,
2394
+ "grad_norm": 24.1474609375,
2395
+ "learning_rate": 1.1111111111111112e-05,
2396
+ "loss": 1.2263,
2397
+ "step": 3410
2398
+ },
2399
+ {
2400
+ "epoch": 2.3408624229979464,
2401
+ "grad_norm": 19.26318359375,
2402
+ "learning_rate": 1.0997033994980608e-05,
2403
+ "loss": 1.1812,
2404
+ "step": 3420
2405
+ },
2406
+ {
2407
+ "epoch": 2.347707049965777,
2408
+ "grad_norm": 21.686351776123047,
2409
+ "learning_rate": 1.0882956878850104e-05,
2410
+ "loss": 1.0536,
2411
+ "step": 3430
2412
+ },
2413
+ {
2414
+ "epoch": 2.3545516769336072,
2415
+ "grad_norm": 18.21807861328125,
2416
+ "learning_rate": 1.0768879762719598e-05,
2417
+ "loss": 1.0296,
2418
+ "step": 3440
2419
+ },
2420
+ {
2421
+ "epoch": 2.3613963039014374,
2422
+ "grad_norm": 11.440802574157715,
2423
+ "learning_rate": 1.0654802646589094e-05,
2424
+ "loss": 1.055,
2425
+ "step": 3450
2426
+ },
2427
+ {
2428
+ "epoch": 2.3682409308692676,
2429
+ "grad_norm": 14.234994888305664,
2430
+ "learning_rate": 1.054072553045859e-05,
2431
+ "loss": 1.0369,
2432
+ "step": 3460
2433
+ },
2434
+ {
2435
+ "epoch": 2.3750855578370977,
2436
+ "grad_norm": 8.702170372009277,
2437
+ "learning_rate": 1.0426648414328086e-05,
2438
+ "loss": 0.84,
2439
+ "step": 3470
2440
+ },
2441
+ {
2442
+ "epoch": 2.3819301848049284,
2443
+ "grad_norm": 20.194971084594727,
2444
+ "learning_rate": 1.0312571298197582e-05,
2445
+ "loss": 0.9005,
2446
+ "step": 3480
2447
+ },
2448
+ {
2449
+ "epoch": 2.3887748117727585,
2450
+ "grad_norm": 14.985250473022461,
2451
+ "learning_rate": 1.0198494182067078e-05,
2452
+ "loss": 0.9658,
2453
+ "step": 3490
2454
+ },
2455
+ {
2456
+ "epoch": 2.3956194387405887,
2457
+ "grad_norm": 11.611641883850098,
2458
+ "learning_rate": 1.0084417065936573e-05,
2459
+ "loss": 0.9147,
2460
+ "step": 3500
2461
+ },
2462
+ {
2463
+ "epoch": 2.402464065708419,
2464
+ "grad_norm": 21.909147262573242,
2465
+ "learning_rate": 9.970339949806069e-06,
2466
+ "loss": 1.0705,
2467
+ "step": 3510
2468
+ },
2469
+ {
2470
+ "epoch": 2.409308692676249,
2471
+ "grad_norm": 21.859275817871094,
2472
+ "learning_rate": 9.856262833675565e-06,
2473
+ "loss": 1.1039,
2474
+ "step": 3520
2475
+ },
2476
+ {
2477
+ "epoch": 2.4161533196440796,
2478
+ "grad_norm": 17.57880973815918,
2479
+ "learning_rate": 9.742185717545061e-06,
2480
+ "loss": 1.116,
2481
+ "step": 3530
2482
+ },
2483
+ {
2484
+ "epoch": 2.42299794661191,
2485
+ "grad_norm": 11.606178283691406,
2486
+ "learning_rate": 9.628108601414557e-06,
2487
+ "loss": 0.9322,
2488
+ "step": 3540
2489
+ },
2490
+ {
2491
+ "epoch": 2.42984257357974,
2492
+ "grad_norm": 15.382245063781738,
2493
+ "learning_rate": 9.514031485284053e-06,
2494
+ "loss": 0.9983,
2495
+ "step": 3550
2496
+ },
2497
+ {
2498
+ "epoch": 2.43668720054757,
2499
+ "grad_norm": 18.811206817626953,
2500
+ "learning_rate": 9.399954369153547e-06,
2501
+ "loss": 1.152,
2502
+ "step": 3560
2503
+ },
2504
+ {
2505
+ "epoch": 2.4435318275154003,
2506
+ "grad_norm": 15.6809663772583,
2507
+ "learning_rate": 9.285877253023043e-06,
2508
+ "loss": 0.9129,
2509
+ "step": 3570
2510
+ },
2511
+ {
2512
+ "epoch": 2.4503764544832305,
2513
+ "grad_norm": 16.564800262451172,
2514
+ "learning_rate": 9.17180013689254e-06,
2515
+ "loss": 1.0157,
2516
+ "step": 3580
2517
+ },
2518
+ {
2519
+ "epoch": 2.457221081451061,
2520
+ "grad_norm": 17.94131088256836,
2521
+ "learning_rate": 9.057723020762035e-06,
2522
+ "loss": 0.9018,
2523
+ "step": 3590
2524
+ },
2525
+ {
2526
+ "epoch": 2.4640657084188913,
2527
+ "grad_norm": 12.62234878540039,
2528
+ "learning_rate": 8.943645904631532e-06,
2529
+ "loss": 1.0834,
2530
+ "step": 3600
2531
+ },
2532
+ {
2533
+ "epoch": 2.4709103353867214,
2534
+ "grad_norm": 19.0474910736084,
2535
+ "learning_rate": 8.829568788501028e-06,
2536
+ "loss": 1.035,
2537
+ "step": 3610
2538
+ },
2539
+ {
2540
+ "epoch": 2.4777549623545516,
2541
+ "grad_norm": 13.592283248901367,
2542
+ "learning_rate": 8.715491672370522e-06,
2543
+ "loss": 1.0291,
2544
+ "step": 3620
2545
+ },
2546
+ {
2547
+ "epoch": 2.484599589322382,
2548
+ "grad_norm": 17.33998680114746,
2549
+ "learning_rate": 8.601414556240018e-06,
2550
+ "loss": 1.0597,
2551
+ "step": 3630
2552
+ },
2553
+ {
2554
+ "epoch": 2.491444216290212,
2555
+ "grad_norm": 10.226778030395508,
2556
+ "learning_rate": 8.487337440109514e-06,
2557
+ "loss": 0.9635,
2558
+ "step": 3640
2559
+ },
2560
+ {
2561
+ "epoch": 2.4982888432580426,
2562
+ "grad_norm": 19.024782180786133,
2563
+ "learning_rate": 8.37326032397901e-06,
2564
+ "loss": 1.1721,
2565
+ "step": 3650
2566
+ },
2567
+ {
2568
+ "epoch": 2.5051334702258727,
2569
+ "grad_norm": 24.122180938720703,
2570
+ "learning_rate": 8.259183207848506e-06,
2571
+ "loss": 1.0413,
2572
+ "step": 3660
2573
+ },
2574
+ {
2575
+ "epoch": 2.511978097193703,
2576
+ "grad_norm": 18.6142578125,
2577
+ "learning_rate": 8.145106091718002e-06,
2578
+ "loss": 0.9202,
2579
+ "step": 3670
2580
+ },
2581
+ {
2582
+ "epoch": 2.518822724161533,
2583
+ "grad_norm": 16.318832397460938,
2584
+ "learning_rate": 8.031028975587497e-06,
2585
+ "loss": 0.9072,
2586
+ "step": 3680
2587
+ },
2588
+ {
2589
+ "epoch": 2.5256673511293632,
2590
+ "grad_norm": 15.191601753234863,
2591
+ "learning_rate": 7.916951859456993e-06,
2592
+ "loss": 1.0822,
2593
+ "step": 3690
2594
+ },
2595
+ {
2596
+ "epoch": 2.532511978097194,
2597
+ "grad_norm": 14.57149600982666,
2598
+ "learning_rate": 7.80287474332649e-06,
2599
+ "loss": 1.0479,
2600
+ "step": 3700
2601
+ },
2602
+ {
2603
+ "epoch": 2.539356605065024,
2604
+ "grad_norm": 15.488064765930176,
2605
+ "learning_rate": 7.688797627195985e-06,
2606
+ "loss": 0.8089,
2607
+ "step": 3710
2608
+ },
2609
+ {
2610
+ "epoch": 2.546201232032854,
2611
+ "grad_norm": 10.953666687011719,
2612
+ "learning_rate": 7.574720511065481e-06,
2613
+ "loss": 0.8452,
2614
+ "step": 3720
2615
+ },
2616
+ {
2617
+ "epoch": 2.5530458590006844,
2618
+ "grad_norm": 16.784223556518555,
2619
+ "learning_rate": 7.460643394934977e-06,
2620
+ "loss": 1.216,
2621
+ "step": 3730
2622
+ },
2623
+ {
2624
+ "epoch": 2.5598904859685145,
2625
+ "grad_norm": 18.735837936401367,
2626
+ "learning_rate": 7.346566278804472e-06,
2627
+ "loss": 1.0247,
2628
+ "step": 3740
2629
+ },
2630
+ {
2631
+ "epoch": 2.566735112936345,
2632
+ "grad_norm": 21.272205352783203,
2633
+ "learning_rate": 7.232489162673968e-06,
2634
+ "loss": 0.9197,
2635
+ "step": 3750
2636
+ },
2637
+ {
2638
+ "epoch": 2.5735797399041753,
2639
+ "grad_norm": 56.637420654296875,
2640
+ "learning_rate": 7.118412046543464e-06,
2641
+ "loss": 1.0617,
2642
+ "step": 3760
2643
+ },
2644
+ {
2645
+ "epoch": 2.5804243668720055,
2646
+ "grad_norm": 10.727712631225586,
2647
+ "learning_rate": 7.004334930412959e-06,
2648
+ "loss": 0.9164,
2649
+ "step": 3770
2650
+ },
2651
+ {
2652
+ "epoch": 2.5872689938398357,
2653
+ "grad_norm": 17.84894561767578,
2654
+ "learning_rate": 6.890257814282455e-06,
2655
+ "loss": 0.9836,
2656
+ "step": 3780
2657
+ },
2658
+ {
2659
+ "epoch": 2.594113620807666,
2660
+ "grad_norm": 12.271729469299316,
2661
+ "learning_rate": 6.7761806981519515e-06,
2662
+ "loss": 0.8887,
2663
+ "step": 3790
2664
+ },
2665
+ {
2666
+ "epoch": 2.6009582477754964,
2667
+ "grad_norm": 20.548677444458008,
2668
+ "learning_rate": 6.662103582021447e-06,
2669
+ "loss": 1.0237,
2670
+ "step": 3800
2671
+ },
2672
+ {
2673
+ "epoch": 2.6078028747433266,
2674
+ "grad_norm": 16.13056755065918,
2675
+ "learning_rate": 6.548026465890943e-06,
2676
+ "loss": 0.9189,
2677
+ "step": 3810
2678
+ },
2679
+ {
2680
+ "epoch": 2.614647501711157,
2681
+ "grad_norm": 8.427151679992676,
2682
+ "learning_rate": 6.433949349760439e-06,
2683
+ "loss": 1.0748,
2684
+ "step": 3820
2685
+ },
2686
+ {
2687
+ "epoch": 2.621492128678987,
2688
+ "grad_norm": 17.58341407775879,
2689
+ "learning_rate": 6.319872233629934e-06,
2690
+ "loss": 1.1057,
2691
+ "step": 3830
2692
+ },
2693
+ {
2694
+ "epoch": 2.628336755646817,
2695
+ "grad_norm": 27.294084548950195,
2696
+ "learning_rate": 6.20579511749943e-06,
2697
+ "loss": 0.9022,
2698
+ "step": 3840
2699
+ },
2700
+ {
2701
+ "epoch": 2.6351813826146477,
2702
+ "grad_norm": 16.329252243041992,
2703
+ "learning_rate": 6.091718001368925e-06,
2704
+ "loss": 1.0333,
2705
+ "step": 3850
2706
+ },
2707
+ {
2708
+ "epoch": 2.642026009582478,
2709
+ "grad_norm": 15.962593078613281,
2710
+ "learning_rate": 5.977640885238422e-06,
2711
+ "loss": 1.2656,
2712
+ "step": 3860
2713
+ },
2714
+ {
2715
+ "epoch": 2.648870636550308,
2716
+ "grad_norm": 16.888147354125977,
2717
+ "learning_rate": 5.863563769107917e-06,
2718
+ "loss": 1.1535,
2719
+ "step": 3870
2720
+ },
2721
+ {
2722
+ "epoch": 2.6557152635181382,
2723
+ "grad_norm": 23.648223876953125,
2724
+ "learning_rate": 5.7494866529774125e-06,
2725
+ "loss": 1.0494,
2726
+ "step": 3880
2727
+ },
2728
+ {
2729
+ "epoch": 2.6625598904859684,
2730
+ "grad_norm": 17.29507064819336,
2731
+ "learning_rate": 5.6354095368469094e-06,
2732
+ "loss": 0.9323,
2733
+ "step": 3890
2734
+ },
2735
+ {
2736
+ "epoch": 2.669404517453799,
2737
+ "grad_norm": 17.345674514770508,
2738
+ "learning_rate": 5.521332420716405e-06,
2739
+ "loss": 1.0372,
2740
+ "step": 3900
2741
+ },
2742
+ {
2743
+ "epoch": 2.6762491444216288,
2744
+ "grad_norm": 19.165477752685547,
2745
+ "learning_rate": 5.4072553045859e-06,
2746
+ "loss": 0.9392,
2747
+ "step": 3910
2748
+ },
2749
+ {
2750
+ "epoch": 2.6830937713894594,
2751
+ "grad_norm": 7.988308906555176,
2752
+ "learning_rate": 5.293178188455396e-06,
2753
+ "loss": 1.0524,
2754
+ "step": 3920
2755
+ },
2756
+ {
2757
+ "epoch": 2.6899383983572895,
2758
+ "grad_norm": 7.466772079467773,
2759
+ "learning_rate": 5.179101072324892e-06,
2760
+ "loss": 0.9797,
2761
+ "step": 3930
2762
+ },
2763
+ {
2764
+ "epoch": 2.6967830253251197,
2765
+ "grad_norm": 17.31673812866211,
2766
+ "learning_rate": 5.065023956194387e-06,
2767
+ "loss": 1.2455,
2768
+ "step": 3940
2769
+ },
2770
+ {
2771
+ "epoch": 2.70362765229295,
2772
+ "grad_norm": 18.40925407409668,
2773
+ "learning_rate": 4.950946840063883e-06,
2774
+ "loss": 0.9221,
2775
+ "step": 3950
2776
+ },
2777
+ {
2778
+ "epoch": 2.71047227926078,
2779
+ "grad_norm": 12.295755386352539,
2780
+ "learning_rate": 4.836869723933379e-06,
2781
+ "loss": 0.9398,
2782
+ "step": 3960
2783
+ },
2784
+ {
2785
+ "epoch": 2.7173169062286107,
2786
+ "grad_norm": 24.060033798217773,
2787
+ "learning_rate": 4.722792607802875e-06,
2788
+ "loss": 0.9803,
2789
+ "step": 3970
2790
+ },
2791
+ {
2792
+ "epoch": 2.724161533196441,
2793
+ "grad_norm": 11.894405364990234,
2794
+ "learning_rate": 4.6087154916723705e-06,
2795
+ "loss": 0.8377,
2796
+ "step": 3980
2797
+ },
2798
+ {
2799
+ "epoch": 2.731006160164271,
2800
+ "grad_norm": 18.28839683532715,
2801
+ "learning_rate": 4.4946383755418666e-06,
2802
+ "loss": 1.0161,
2803
+ "step": 3990
2804
+ },
2805
+ {
2806
+ "epoch": 2.737850787132101,
2807
+ "grad_norm": 14.643180847167969,
2808
+ "learning_rate": 4.380561259411363e-06,
2809
+ "loss": 0.9987,
2810
+ "step": 4000
2811
+ },
2812
+ {
2813
+ "epoch": 2.7446954140999313,
2814
+ "grad_norm": 18.306209564208984,
2815
+ "learning_rate": 4.266484143280858e-06,
2816
+ "loss": 1.0731,
2817
+ "step": 4010
2818
+ },
2819
+ {
2820
+ "epoch": 2.751540041067762,
2821
+ "grad_norm": 18.605947494506836,
2822
+ "learning_rate": 4.152407027150354e-06,
2823
+ "loss": 1.0158,
2824
+ "step": 4020
2825
+ },
2826
+ {
2827
+ "epoch": 2.758384668035592,
2828
+ "grad_norm": 19.448841094970703,
2829
+ "learning_rate": 4.03832991101985e-06,
2830
+ "loss": 1.0965,
2831
+ "step": 4030
2832
+ },
2833
+ {
2834
+ "epoch": 2.7652292950034223,
2835
+ "grad_norm": 25.54683494567871,
2836
+ "learning_rate": 3.924252794889345e-06,
2837
+ "loss": 0.9571,
2838
+ "step": 4040
2839
+ },
2840
+ {
2841
+ "epoch": 2.7720739219712525,
2842
+ "grad_norm": 13.25411319732666,
2843
+ "learning_rate": 3.810175678758841e-06,
2844
+ "loss": 0.9362,
2845
+ "step": 4050
2846
+ },
2847
+ {
2848
+ "epoch": 2.7789185489390826,
2849
+ "grad_norm": 9.98473072052002,
2850
+ "learning_rate": 3.696098562628337e-06,
2851
+ "loss": 1.029,
2852
+ "step": 4060
2853
+ },
2854
+ {
2855
+ "epoch": 2.7857631759069132,
2856
+ "grad_norm": 9.64309310913086,
2857
+ "learning_rate": 3.5820214464978324e-06,
2858
+ "loss": 0.8301,
2859
+ "step": 4070
2860
+ },
2861
+ {
2862
+ "epoch": 2.7926078028747434,
2863
+ "grad_norm": 11.841326713562012,
2864
+ "learning_rate": 3.4679443303673285e-06,
2865
+ "loss": 1.0573,
2866
+ "step": 4080
2867
+ },
2868
+ {
2869
+ "epoch": 2.7994524298425736,
2870
+ "grad_norm": 26.954729080200195,
2871
+ "learning_rate": 3.353867214236824e-06,
2872
+ "loss": 1.0725,
2873
+ "step": 4090
2874
+ },
2875
+ {
2876
+ "epoch": 2.8062970568104038,
2877
+ "grad_norm": 18.035999298095703,
2878
+ "learning_rate": 3.2397900981063197e-06,
2879
+ "loss": 0.9733,
2880
+ "step": 4100
2881
+ },
2882
+ {
2883
+ "epoch": 2.813141683778234,
2884
+ "grad_norm": 14.014225006103516,
2885
+ "learning_rate": 3.1257129819758162e-06,
2886
+ "loss": 1.0049,
2887
+ "step": 4110
2888
+ },
2889
+ {
2890
+ "epoch": 2.8199863107460645,
2891
+ "grad_norm": 17.872325897216797,
2892
+ "learning_rate": 3.0116358658453114e-06,
2893
+ "loss": 1.1936,
2894
+ "step": 4120
2895
+ },
2896
+ {
2897
+ "epoch": 2.8268309377138947,
2898
+ "grad_norm": 12.128450393676758,
2899
+ "learning_rate": 2.8975587497148075e-06,
2900
+ "loss": 0.9367,
2901
+ "step": 4130
2902
+ },
2903
+ {
2904
+ "epoch": 2.833675564681725,
2905
+ "grad_norm": 24.44780158996582,
2906
+ "learning_rate": 2.783481633584303e-06,
2907
+ "loss": 0.8604,
2908
+ "step": 4140
2909
+ },
2910
+ {
2911
+ "epoch": 2.840520191649555,
2912
+ "grad_norm": 20.578285217285156,
2913
+ "learning_rate": 2.6694045174537987e-06,
2914
+ "loss": 1.0186,
2915
+ "step": 4150
2916
+ },
2917
+ {
2918
+ "epoch": 2.847364818617385,
2919
+ "grad_norm": 13.210015296936035,
2920
+ "learning_rate": 2.555327401323295e-06,
2921
+ "loss": 0.9702,
2922
+ "step": 4160
2923
+ },
2924
+ {
2925
+ "epoch": 2.854209445585216,
2926
+ "grad_norm": 20.814693450927734,
2927
+ "learning_rate": 2.4412502851927904e-06,
2928
+ "loss": 1.0134,
2929
+ "step": 4170
2930
+ },
2931
+ {
2932
+ "epoch": 2.861054072553046,
2933
+ "grad_norm": 23.13202667236328,
2934
+ "learning_rate": 2.327173169062286e-06,
2935
+ "loss": 1.0246,
2936
+ "step": 4180
2937
+ },
2938
+ {
2939
+ "epoch": 2.867898699520876,
2940
+ "grad_norm": 14.703669548034668,
2941
+ "learning_rate": 2.213096052931782e-06,
2942
+ "loss": 0.8479,
2943
+ "step": 4190
2944
+ },
2945
+ {
2946
+ "epoch": 2.8747433264887063,
2947
+ "grad_norm": 17.403057098388672,
2948
+ "learning_rate": 2.0990189368012777e-06,
2949
+ "loss": 1.0578,
2950
+ "step": 4200
2951
+ },
2952
+ {
2953
+ "epoch": 2.8815879534565365,
2954
+ "grad_norm": 23.017269134521484,
2955
+ "learning_rate": 1.9849418206707738e-06,
2956
+ "loss": 0.8832,
2957
+ "step": 4210
2958
+ },
2959
+ {
2960
+ "epoch": 2.888432580424367,
2961
+ "grad_norm": 26.879880905151367,
2962
+ "learning_rate": 1.8708647045402694e-06,
2963
+ "loss": 1.0184,
2964
+ "step": 4220
2965
+ },
2966
+ {
2967
+ "epoch": 2.895277207392197,
2968
+ "grad_norm": 5.334693431854248,
2969
+ "learning_rate": 1.756787588409765e-06,
2970
+ "loss": 0.7945,
2971
+ "step": 4230
2972
+ },
2973
+ {
2974
+ "epoch": 2.9021218343600275,
2975
+ "grad_norm": 18.800867080688477,
2976
+ "learning_rate": 1.6427104722792609e-06,
2977
+ "loss": 1.0574,
2978
+ "step": 4240
2979
+ },
2980
+ {
2981
+ "epoch": 2.9089664613278576,
2982
+ "grad_norm": 23.8500919342041,
2983
+ "learning_rate": 1.5286333561487567e-06,
2984
+ "loss": 0.9234,
2985
+ "step": 4250
2986
+ },
2987
+ {
2988
+ "epoch": 2.915811088295688,
2989
+ "grad_norm": 19.745752334594727,
2990
+ "learning_rate": 1.4145562400182523e-06,
2991
+ "loss": 0.8584,
2992
+ "step": 4260
2993
+ },
2994
+ {
2995
+ "epoch": 2.922655715263518,
2996
+ "grad_norm": 10.134031295776367,
2997
+ "learning_rate": 1.3004791238877482e-06,
2998
+ "loss": 0.8596,
2999
+ "step": 4270
3000
+ },
3001
+ {
3002
+ "epoch": 2.929500342231348,
3003
+ "grad_norm": 17.87618637084961,
3004
+ "learning_rate": 1.186402007757244e-06,
3005
+ "loss": 1.044,
3006
+ "step": 4280
3007
+ },
3008
+ {
3009
+ "epoch": 2.9363449691991788,
3010
+ "grad_norm": 37.101905822753906,
3011
+ "learning_rate": 1.0723248916267397e-06,
3012
+ "loss": 1.0645,
3013
+ "step": 4290
3014
+ },
3015
+ {
3016
+ "epoch": 2.943189596167009,
3017
+ "grad_norm": 14.37732982635498,
3018
+ "learning_rate": 9.582477754962355e-07,
3019
+ "loss": 1.0216,
3020
+ "step": 4300
3021
+ },
3022
+ {
3023
+ "epoch": 2.950034223134839,
3024
+ "grad_norm": 19.091243743896484,
3025
+ "learning_rate": 8.441706593657313e-07,
3026
+ "loss": 0.9481,
3027
+ "step": 4310
3028
+ },
3029
+ {
3030
+ "epoch": 2.9568788501026693,
3031
+ "grad_norm": 14.882901191711426,
3032
+ "learning_rate": 7.300935432352271e-07,
3033
+ "loss": 0.9584,
3034
+ "step": 4320
3035
+ },
3036
+ {
3037
+ "epoch": 2.9637234770704994,
3038
+ "grad_norm": 8.84882926940918,
3039
+ "learning_rate": 6.160164271047228e-07,
3040
+ "loss": 0.8511,
3041
+ "step": 4330
3042
+ },
3043
+ {
3044
+ "epoch": 2.97056810403833,
3045
+ "grad_norm": 13.387128829956055,
3046
+ "learning_rate": 5.019393109742185e-07,
3047
+ "loss": 1.0989,
3048
+ "step": 4340
3049
+ },
3050
+ {
3051
+ "epoch": 2.97741273100616,
3052
+ "grad_norm": 12.08794116973877,
3053
+ "learning_rate": 3.878621948437144e-07,
3054
+ "loss": 0.9652,
3055
+ "step": 4350
3056
+ },
3057
+ {
3058
+ "epoch": 2.9842573579739904,
3059
+ "grad_norm": 19.42278480529785,
3060
+ "learning_rate": 2.7378507871321016e-07,
3061
+ "loss": 1.0697,
3062
+ "step": 4360
3063
+ },
3064
+ {
3065
+ "epoch": 2.9911019849418206,
3066
+ "grad_norm": 12.420310974121094,
3067
+ "learning_rate": 1.5970796258270592e-07,
3068
+ "loss": 0.9824,
3069
+ "step": 4370
3070
+ },
3071
+ {
3072
+ "epoch": 2.9979466119096507,
3073
+ "grad_norm": 17.423847198486328,
3074
+ "learning_rate": 4.563084645220169e-08,
3075
+ "loss": 1.0081,
3076
+ "step": 4380
3077
+ }
3078
+ ],
3079
+ "logging_steps": 10,
3080
+ "max_steps": 4383,
3081
+ "num_input_tokens_seen": 0,
3082
+ "num_train_epochs": 3,
3083
+ "save_steps": 500,
3084
+ "stateful_callbacks": {
3085
+ "TrainerControl": {
3086
+ "args": {
3087
+ "should_epoch_stop": false,
3088
+ "should_evaluate": false,
3089
+ "should_log": false,
3090
+ "should_save": true,
3091
+ "should_training_stop": true
3092
+ },
3093
+ "attributes": {}
3094
+ }
3095
+ },
3096
+ "total_flos": 2305112266591488.0,
3097
+ "train_batch_size": 8,
3098
+ "trial_name": null,
3099
+ "trial_params": null
3100
+ }
checkpoint-4383/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40cdeb7ac4051db3a8aa193f51121d624fbfd7ad0fb77b13a6c94f0eb02c2c7
3
+ size 5240
checkpoint-4383/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "LABEL_0",
13
+ "1": "LABEL_1",
14
+ "2": "LABEL_2",
15
+ "3": "LABEL_3",
16
+ "4": "LABEL_4"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "label2id": {
21
+ "LABEL_0": 0,
22
+ "LABEL_1": 1,
23
+ "LABEL_2": 2,
24
+ "LABEL_3": 3,
25
+ "LABEL_4": 4
26
+ },
27
+ "layer_norm_eps": 1e-12,
28
+ "max_position_embeddings": 512,
29
+ "model_type": "bert",
30
+ "num_attention_heads": 12,
31
+ "num_hidden_layers": 12,
32
+ "pad_token_id": 0,
33
+ "position_embedding_type": "absolute",
34
+ "problem_type": "single_label_classification",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.51.0",
37
+ "type_vocab_size": 2,
38
+ "use_cache": true,
39
+ "vocab_size": 30522
40
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aaa4eb908d85996f72b3a3ac68eb9233eec05e7080b358420644dc0bef050fba
3
+ size 437967876
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 512,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40cdeb7ac4051db3a8aa193f51121d624fbfd7ad0fb77b13a6c94f0eb02c2c7
3
+ size 5240
vocab.txt ADDED
The diff for this file is too large to render. See raw diff