MarkProMaster229 commited on
Commit
e8ee6c9
·
verified ·
1 Parent(s): 5bfe925

Upload 13 files

Browse files
config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 2,
11
+ "gradient_checkpointing": false,
12
+ "id2label": {
13
+ "0": "LABEL_0"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "label2id": {
17
+ "LABEL_0": 0
18
+ },
19
+ "layer_norm_epsilon": 1e-05,
20
+ "model_type": "gpt2",
21
+ "n_ctx": 2048,
22
+ "n_embd": 768,
23
+ "n_head": 12,
24
+ "n_inner": null,
25
+ "n_layer": 12,
26
+ "n_positions": 2048,
27
+ "pad_token_id": 2,
28
+ "reorder_and_upcast_attn": false,
29
+ "resid_pdrop": 0.1,
30
+ "scale_attn_by_inverse_layer_idx": false,
31
+ "scale_attn_weights": true,
32
+ "summary_activation": null,
33
+ "summary_first_dropout": 0.1,
34
+ "summary_proj_to_labels": true,
35
+ "summary_type": "cls_index",
36
+ "summary_use_proj": true,
37
+ "transformers_version": "4.57.3",
38
+ "use_cache": true,
39
+ "vocab_size": 50257
40
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": [
5
+ 2
6
+ ],
7
+ "pad_token_id": 2,
8
+ "transformers_version": "4.57.3"
9
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:941af006c53b04c8b3d79d6a92c9d95357ad1a9d1f0e35a3fe441bd7b3025b7b
3
+ size 500919936
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b7c46511f5499ee004484bc8087636f995b11dfdc79096308d94a3e514e9a87
3
+ size 1001933754
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b134f8e3af884ecbd7d1e2ca01867adadd87959eaa2547534fb7d1ad32f8a29
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d90d45eba9611e74e6ec237c3541e68dbb441dd6187112cfd8db9bbad5d0e7af
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "mask_token": {
17
+ "content": "<mask>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "pad_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "<unk>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<mask>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ }
45
+ },
46
+ "bos_token": "<s>",
47
+ "clean_up_tokenization_spaces": true,
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "max_length": 128,
53
+ "model_max_length": 2048,
54
+ "pad_to_multiple_of": null,
55
+ "pad_token": "</s>",
56
+ "pad_token_type_id": 0,
57
+ "padding_side": "left",
58
+ "stride": 0,
59
+ "tokenizer_class": "GPT2Tokenizer",
60
+ "truncation_side": "left",
61
+ "truncation_strategy": "longest_first",
62
+ "trust_remote_code": false,
63
+ "unk_token": "<unk>"
64
+ }
trainer_state.json ADDED
@@ -0,0 +1,2036 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 28604,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0034960145434205004,
14
+ "grad_norm": 3.56770658493042,
15
+ "learning_rate": 4.982694728010069e-05,
16
+ "loss": 5.1215,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.006992029086841001,
21
+ "grad_norm": 3.0420525074005127,
22
+ "learning_rate": 4.965214655292966e-05,
23
+ "loss": 5.1137,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.010488043630261502,
28
+ "grad_norm": 3.0995962619781494,
29
+ "learning_rate": 4.9477345825758635e-05,
30
+ "loss": 5.1482,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.013984058173682002,
35
+ "grad_norm": 2.8879716396331787,
36
+ "learning_rate": 4.930254509858761e-05,
37
+ "loss": 5.1249,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.0174800727171025,
42
+ "grad_norm": 2.7549917697906494,
43
+ "learning_rate": 4.912774437141659e-05,
44
+ "loss": 5.0889,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.020976087260523003,
49
+ "grad_norm": 2.814000129699707,
50
+ "learning_rate": 4.895294364424556e-05,
51
+ "loss": 5.1358,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.024472101803943505,
56
+ "grad_norm": 2.6768572330474854,
57
+ "learning_rate": 4.877814291707454e-05,
58
+ "loss": 5.1105,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.027968116347364003,
63
+ "grad_norm": 2.678793430328369,
64
+ "learning_rate": 4.860334218990351e-05,
65
+ "loss": 5.1058,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.03146413089078451,
70
+ "grad_norm": 3.129312753677368,
71
+ "learning_rate": 4.8428541462732484e-05,
72
+ "loss": 5.1086,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.034960145434205,
77
+ "grad_norm": 2.9247419834136963,
78
+ "learning_rate": 4.8253740735561464e-05,
79
+ "loss": 5.1296,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.038456159977625505,
84
+ "grad_norm": 2.542954206466675,
85
+ "learning_rate": 4.807894000839044e-05,
86
+ "loss": 5.0857,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.04195217452104601,
91
+ "grad_norm": 2.79154372215271,
92
+ "learning_rate": 4.790413928121941e-05,
93
+ "loss": 5.1134,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.04544818906446651,
98
+ "grad_norm": 2.6644339561462402,
99
+ "learning_rate": 4.7729338554048386e-05,
100
+ "loss": 5.105,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.04894420360788701,
105
+ "grad_norm": 2.7891204357147217,
106
+ "learning_rate": 4.755453782687736e-05,
107
+ "loss": 5.0809,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.05244021815130751,
112
+ "grad_norm": 2.664785623550415,
113
+ "learning_rate": 4.737973709970634e-05,
114
+ "loss": 5.0933,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 0.05593623269472801,
119
+ "grad_norm": 3.264219284057617,
120
+ "learning_rate": 4.720493637253531e-05,
121
+ "loss": 5.1067,
122
+ "step": 1600
123
+ },
124
+ {
125
+ "epoch": 0.05943224723814851,
126
+ "grad_norm": 2.754682779312134,
127
+ "learning_rate": 4.703013564536429e-05,
128
+ "loss": 5.0338,
129
+ "step": 1700
130
+ },
131
+ {
132
+ "epoch": 0.06292826178156902,
133
+ "grad_norm": 2.821290969848633,
134
+ "learning_rate": 4.685533491819326e-05,
135
+ "loss": 5.0775,
136
+ "step": 1800
137
+ },
138
+ {
139
+ "epoch": 0.06642427632498951,
140
+ "grad_norm": 2.995471954345703,
141
+ "learning_rate": 4.6680534191022234e-05,
142
+ "loss": 5.0705,
143
+ "step": 1900
144
+ },
145
+ {
146
+ "epoch": 0.06992029086841,
147
+ "grad_norm": 2.7494113445281982,
148
+ "learning_rate": 4.6505733463851215e-05,
149
+ "loss": 5.0112,
150
+ "step": 2000
151
+ },
152
+ {
153
+ "epoch": 0.07341630541183052,
154
+ "grad_norm": 3.065732479095459,
155
+ "learning_rate": 4.633093273668019e-05,
156
+ "loss": 5.1087,
157
+ "step": 2100
158
+ },
159
+ {
160
+ "epoch": 0.07691231995525101,
161
+ "grad_norm": 2.742866039276123,
162
+ "learning_rate": 4.615613200950916e-05,
163
+ "loss": 5.0839,
164
+ "step": 2200
165
+ },
166
+ {
167
+ "epoch": 0.08040833449867152,
168
+ "grad_norm": 2.7256627082824707,
169
+ "learning_rate": 4.5981331282338136e-05,
170
+ "loss": 5.0284,
171
+ "step": 2300
172
+ },
173
+ {
174
+ "epoch": 0.08390434904209201,
175
+ "grad_norm": 2.7911083698272705,
176
+ "learning_rate": 4.580653055516711e-05,
177
+ "loss": 5.0478,
178
+ "step": 2400
179
+ },
180
+ {
181
+ "epoch": 0.08740036358551252,
182
+ "grad_norm": 2.8085787296295166,
183
+ "learning_rate": 4.563172982799609e-05,
184
+ "loss": 5.0503,
185
+ "step": 2500
186
+ },
187
+ {
188
+ "epoch": 0.09089637812893302,
189
+ "grad_norm": 2.870654344558716,
190
+ "learning_rate": 4.5456929100825064e-05,
191
+ "loss": 5.0605,
192
+ "step": 2600
193
+ },
194
+ {
195
+ "epoch": 0.09439239267235351,
196
+ "grad_norm": 2.848191976547241,
197
+ "learning_rate": 4.528212837365404e-05,
198
+ "loss": 5.0471,
199
+ "step": 2700
200
+ },
201
+ {
202
+ "epoch": 0.09788840721577402,
203
+ "grad_norm": 2.4951555728912354,
204
+ "learning_rate": 4.510732764648301e-05,
205
+ "loss": 5.0179,
206
+ "step": 2800
207
+ },
208
+ {
209
+ "epoch": 0.10138442175919452,
210
+ "grad_norm": 2.7134485244750977,
211
+ "learning_rate": 4.4932526919311985e-05,
212
+ "loss": 5.0412,
213
+ "step": 2900
214
+ },
215
+ {
216
+ "epoch": 0.10488043630261502,
217
+ "grad_norm": 3.0360782146453857,
218
+ "learning_rate": 4.4757726192140965e-05,
219
+ "loss": 5.0416,
220
+ "step": 3000
221
+ },
222
+ {
223
+ "epoch": 0.10837645084603552,
224
+ "grad_norm": 2.8149096965789795,
225
+ "learning_rate": 4.458292546496994e-05,
226
+ "loss": 5.0165,
227
+ "step": 3100
228
+ },
229
+ {
230
+ "epoch": 0.11187246538945601,
231
+ "grad_norm": 3.08152174949646,
232
+ "learning_rate": 4.440812473779891e-05,
233
+ "loss": 4.9924,
234
+ "step": 3200
235
+ },
236
+ {
237
+ "epoch": 0.11536847993287652,
238
+ "grad_norm": 2.820800542831421,
239
+ "learning_rate": 4.4233324010627886e-05,
240
+ "loss": 5.0404,
241
+ "step": 3300
242
+ },
243
+ {
244
+ "epoch": 0.11886449447629702,
245
+ "grad_norm": 2.6280956268310547,
246
+ "learning_rate": 4.405852328345686e-05,
247
+ "loss": 5.0052,
248
+ "step": 3400
249
+ },
250
+ {
251
+ "epoch": 0.12236050901971753,
252
+ "grad_norm": 2.5183768272399902,
253
+ "learning_rate": 4.388372255628584e-05,
254
+ "loss": 5.0197,
255
+ "step": 3500
256
+ },
257
+ {
258
+ "epoch": 0.12585652356313803,
259
+ "grad_norm": 2.802274465560913,
260
+ "learning_rate": 4.3708921829114814e-05,
261
+ "loss": 4.99,
262
+ "step": 3600
263
+ },
264
+ {
265
+ "epoch": 0.12935253810655853,
266
+ "grad_norm": 2.8082516193389893,
267
+ "learning_rate": 4.353412110194379e-05,
268
+ "loss": 4.998,
269
+ "step": 3700
270
+ },
271
+ {
272
+ "epoch": 0.13284855264997902,
273
+ "grad_norm": 2.8567910194396973,
274
+ "learning_rate": 4.335932037477276e-05,
275
+ "loss": 5.007,
276
+ "step": 3800
277
+ },
278
+ {
279
+ "epoch": 0.13634456719339952,
280
+ "grad_norm": 2.6747946739196777,
281
+ "learning_rate": 4.3184519647601735e-05,
282
+ "loss": 5.0372,
283
+ "step": 3900
284
+ },
285
+ {
286
+ "epoch": 0.13984058173682,
287
+ "grad_norm": 3.1875665187835693,
288
+ "learning_rate": 4.3009718920430715e-05,
289
+ "loss": 5.008,
290
+ "step": 4000
291
+ },
292
+ {
293
+ "epoch": 0.14333659628024054,
294
+ "grad_norm": 2.5646297931671143,
295
+ "learning_rate": 4.283491819325969e-05,
296
+ "loss": 5.0014,
297
+ "step": 4100
298
+ },
299
+ {
300
+ "epoch": 0.14683261082366103,
301
+ "grad_norm": 2.9062674045562744,
302
+ "learning_rate": 4.266011746608866e-05,
303
+ "loss": 4.999,
304
+ "step": 4200
305
+ },
306
+ {
307
+ "epoch": 0.15032862536708153,
308
+ "grad_norm": 2.6203787326812744,
309
+ "learning_rate": 4.2485316738917637e-05,
310
+ "loss": 4.9994,
311
+ "step": 4300
312
+ },
313
+ {
314
+ "epoch": 0.15382463991050202,
315
+ "grad_norm": 2.530931234359741,
316
+ "learning_rate": 4.231051601174661e-05,
317
+ "loss": 4.9835,
318
+ "step": 4400
319
+ },
320
+ {
321
+ "epoch": 0.15732065445392251,
322
+ "grad_norm": 2.436789035797119,
323
+ "learning_rate": 4.2135715284575584e-05,
324
+ "loss": 4.9711,
325
+ "step": 4500
326
+ },
327
+ {
328
+ "epoch": 0.16081666899734304,
329
+ "grad_norm": 2.867676019668579,
330
+ "learning_rate": 4.1960914557404564e-05,
331
+ "loss": 4.9843,
332
+ "step": 4600
333
+ },
334
+ {
335
+ "epoch": 0.16431268354076353,
336
+ "grad_norm": 3.0732126235961914,
337
+ "learning_rate": 4.178611383023354e-05,
338
+ "loss": 4.9614,
339
+ "step": 4700
340
+ },
341
+ {
342
+ "epoch": 0.16780869808418403,
343
+ "grad_norm": 2.7034592628479004,
344
+ "learning_rate": 4.161131310306251e-05,
345
+ "loss": 5.0208,
346
+ "step": 4800
347
+ },
348
+ {
349
+ "epoch": 0.17130471262760452,
350
+ "grad_norm": 2.957083225250244,
351
+ "learning_rate": 4.1436512375891485e-05,
352
+ "loss": 4.9884,
353
+ "step": 4900
354
+ },
355
+ {
356
+ "epoch": 0.17480072717102504,
357
+ "grad_norm": 2.7425382137298584,
358
+ "learning_rate": 4.126171164872046e-05,
359
+ "loss": 4.9781,
360
+ "step": 5000
361
+ },
362
+ {
363
+ "epoch": 0.17829674171444554,
364
+ "grad_norm": 2.872343063354492,
365
+ "learning_rate": 4.108691092154944e-05,
366
+ "loss": 5.0063,
367
+ "step": 5100
368
+ },
369
+ {
370
+ "epoch": 0.18179275625786603,
371
+ "grad_norm": 2.698291301727295,
372
+ "learning_rate": 4.091211019437841e-05,
373
+ "loss": 5.0118,
374
+ "step": 5200
375
+ },
376
+ {
377
+ "epoch": 0.18528877080128653,
378
+ "grad_norm": 2.5991404056549072,
379
+ "learning_rate": 4.073730946720739e-05,
380
+ "loss": 4.9926,
381
+ "step": 5300
382
+ },
383
+ {
384
+ "epoch": 0.18878478534470702,
385
+ "grad_norm": 2.681043863296509,
386
+ "learning_rate": 4.056250874003636e-05,
387
+ "loss": 4.9436,
388
+ "step": 5400
389
+ },
390
+ {
391
+ "epoch": 0.19228079988812755,
392
+ "grad_norm": 2.5356483459472656,
393
+ "learning_rate": 4.0387708012865334e-05,
394
+ "loss": 4.9587,
395
+ "step": 5500
396
+ },
397
+ {
398
+ "epoch": 0.19577681443154804,
399
+ "grad_norm": 2.8511860370635986,
400
+ "learning_rate": 4.0212907285694315e-05,
401
+ "loss": 4.9898,
402
+ "step": 5600
403
+ },
404
+ {
405
+ "epoch": 0.19927282897496854,
406
+ "grad_norm": 2.5632340908050537,
407
+ "learning_rate": 4.003810655852329e-05,
408
+ "loss": 4.9834,
409
+ "step": 5700
410
+ },
411
+ {
412
+ "epoch": 0.20276884351838903,
413
+ "grad_norm": 3.114943742752075,
414
+ "learning_rate": 3.986330583135226e-05,
415
+ "loss": 4.9516,
416
+ "step": 5800
417
+ },
418
+ {
419
+ "epoch": 0.20626485806180953,
420
+ "grad_norm": 2.759326457977295,
421
+ "learning_rate": 3.9688505104181236e-05,
422
+ "loss": 5.0019,
423
+ "step": 5900
424
+ },
425
+ {
426
+ "epoch": 0.20976087260523005,
427
+ "grad_norm": 2.8129642009735107,
428
+ "learning_rate": 3.951370437701021e-05,
429
+ "loss": 4.9568,
430
+ "step": 6000
431
+ },
432
+ {
433
+ "epoch": 0.21325688714865054,
434
+ "grad_norm": 2.8741278648376465,
435
+ "learning_rate": 3.933890364983919e-05,
436
+ "loss": 4.9049,
437
+ "step": 6100
438
+ },
439
+ {
440
+ "epoch": 0.21675290169207104,
441
+ "grad_norm": 2.5861387252807617,
442
+ "learning_rate": 3.9164102922668164e-05,
443
+ "loss": 5.0054,
444
+ "step": 6200
445
+ },
446
+ {
447
+ "epoch": 0.22024891623549153,
448
+ "grad_norm": 2.668247938156128,
449
+ "learning_rate": 3.898930219549714e-05,
450
+ "loss": 4.9488,
451
+ "step": 6300
452
+ },
453
+ {
454
+ "epoch": 0.22374493077891203,
455
+ "grad_norm": 2.8038411140441895,
456
+ "learning_rate": 3.881450146832611e-05,
457
+ "loss": 4.9846,
458
+ "step": 6400
459
+ },
460
+ {
461
+ "epoch": 0.22724094532233255,
462
+ "grad_norm": 2.64792799949646,
463
+ "learning_rate": 3.8639700741155085e-05,
464
+ "loss": 4.9734,
465
+ "step": 6500
466
+ },
467
+ {
468
+ "epoch": 0.23073695986575304,
469
+ "grad_norm": 3.3240697383880615,
470
+ "learning_rate": 3.8464900013984065e-05,
471
+ "loss": 4.9348,
472
+ "step": 6600
473
+ },
474
+ {
475
+ "epoch": 0.23423297440917354,
476
+ "grad_norm": 2.762397050857544,
477
+ "learning_rate": 3.829009928681304e-05,
478
+ "loss": 4.9853,
479
+ "step": 6700
480
+ },
481
+ {
482
+ "epoch": 0.23772898895259403,
483
+ "grad_norm": 2.5263381004333496,
484
+ "learning_rate": 3.811529855964201e-05,
485
+ "loss": 4.9199,
486
+ "step": 6800
487
+ },
488
+ {
489
+ "epoch": 0.24122500349601456,
490
+ "grad_norm": 3.1513848304748535,
491
+ "learning_rate": 3.7940497832470986e-05,
492
+ "loss": 4.9958,
493
+ "step": 6900
494
+ },
495
+ {
496
+ "epoch": 0.24472101803943505,
497
+ "grad_norm": 2.661020278930664,
498
+ "learning_rate": 3.776569710529996e-05,
499
+ "loss": 4.9228,
500
+ "step": 7000
501
+ },
502
+ {
503
+ "epoch": 0.24821703258285555,
504
+ "grad_norm": 2.742587089538574,
505
+ "learning_rate": 3.759089637812894e-05,
506
+ "loss": 4.9588,
507
+ "step": 7100
508
+ },
509
+ {
510
+ "epoch": 0.25171304712627607,
511
+ "grad_norm": 2.8065550327301025,
512
+ "learning_rate": 3.7416095650957914e-05,
513
+ "loss": 4.9245,
514
+ "step": 7200
515
+ },
516
+ {
517
+ "epoch": 0.25520906166969654,
518
+ "grad_norm": 2.639291524887085,
519
+ "learning_rate": 3.724129492378689e-05,
520
+ "loss": 4.9722,
521
+ "step": 7300
522
+ },
523
+ {
524
+ "epoch": 0.25870507621311706,
525
+ "grad_norm": 2.7371039390563965,
526
+ "learning_rate": 3.706649419661586e-05,
527
+ "loss": 4.9584,
528
+ "step": 7400
529
+ },
530
+ {
531
+ "epoch": 0.2622010907565375,
532
+ "grad_norm": 2.891352891921997,
533
+ "learning_rate": 3.6891693469444835e-05,
534
+ "loss": 4.939,
535
+ "step": 7500
536
+ },
537
+ {
538
+ "epoch": 0.26569710529995805,
539
+ "grad_norm": 2.5609800815582275,
540
+ "learning_rate": 3.671689274227381e-05,
541
+ "loss": 4.9424,
542
+ "step": 7600
543
+ },
544
+ {
545
+ "epoch": 0.26919311984337857,
546
+ "grad_norm": 2.6541666984558105,
547
+ "learning_rate": 3.654209201510279e-05,
548
+ "loss": 4.9347,
549
+ "step": 7700
550
+ },
551
+ {
552
+ "epoch": 0.27268913438679904,
553
+ "grad_norm": 3.461524724960327,
554
+ "learning_rate": 3.636729128793176e-05,
555
+ "loss": 4.9917,
556
+ "step": 7800
557
+ },
558
+ {
559
+ "epoch": 0.27618514893021956,
560
+ "grad_norm": 2.6681628227233887,
561
+ "learning_rate": 3.6192490560760736e-05,
562
+ "loss": 4.9667,
563
+ "step": 7900
564
+ },
565
+ {
566
+ "epoch": 0.27968116347364,
567
+ "grad_norm": 2.8997554779052734,
568
+ "learning_rate": 3.601768983358971e-05,
569
+ "loss": 4.8909,
570
+ "step": 8000
571
+ },
572
+ {
573
+ "epoch": 0.28317717801706055,
574
+ "grad_norm": 2.552457094192505,
575
+ "learning_rate": 3.5842889106418684e-05,
576
+ "loss": 4.9577,
577
+ "step": 8100
578
+ },
579
+ {
580
+ "epoch": 0.28667319256048107,
581
+ "grad_norm": 2.6073288917541504,
582
+ "learning_rate": 3.566808837924766e-05,
583
+ "loss": 4.9378,
584
+ "step": 8200
585
+ },
586
+ {
587
+ "epoch": 0.29016920710390154,
588
+ "grad_norm": 2.604503870010376,
589
+ "learning_rate": 3.549328765207663e-05,
590
+ "loss": 4.9283,
591
+ "step": 8300
592
+ },
593
+ {
594
+ "epoch": 0.29366522164732206,
595
+ "grad_norm": 2.9705913066864014,
596
+ "learning_rate": 3.531848692490561e-05,
597
+ "loss": 4.9143,
598
+ "step": 8400
599
+ },
600
+ {
601
+ "epoch": 0.29716123619074253,
602
+ "grad_norm": 2.5234224796295166,
603
+ "learning_rate": 3.5143686197734585e-05,
604
+ "loss": 4.9231,
605
+ "step": 8500
606
+ },
607
+ {
608
+ "epoch": 0.30065725073416305,
609
+ "grad_norm": 2.8847386837005615,
610
+ "learning_rate": 3.496888547056356e-05,
611
+ "loss": 4.9186,
612
+ "step": 8600
613
+ },
614
+ {
615
+ "epoch": 0.3041532652775836,
616
+ "grad_norm": 3.093806505203247,
617
+ "learning_rate": 3.479408474339253e-05,
618
+ "loss": 4.895,
619
+ "step": 8700
620
+ },
621
+ {
622
+ "epoch": 0.30764927982100404,
623
+ "grad_norm": 3.003735303878784,
624
+ "learning_rate": 3.4619284016221506e-05,
625
+ "loss": 4.8982,
626
+ "step": 8800
627
+ },
628
+ {
629
+ "epoch": 0.31114529436442456,
630
+ "grad_norm": 2.972370147705078,
631
+ "learning_rate": 3.444448328905048e-05,
632
+ "loss": 4.9627,
633
+ "step": 8900
634
+ },
635
+ {
636
+ "epoch": 0.31464130890784503,
637
+ "grad_norm": 2.95778751373291,
638
+ "learning_rate": 3.426968256187946e-05,
639
+ "loss": 4.8926,
640
+ "step": 9000
641
+ },
642
+ {
643
+ "epoch": 0.31813732345126555,
644
+ "grad_norm": 2.7067129611968994,
645
+ "learning_rate": 3.4094881834708434e-05,
646
+ "loss": 4.9371,
647
+ "step": 9100
648
+ },
649
+ {
650
+ "epoch": 0.3216333379946861,
651
+ "grad_norm": 2.6828505992889404,
652
+ "learning_rate": 3.392008110753741e-05,
653
+ "loss": 4.9095,
654
+ "step": 9200
655
+ },
656
+ {
657
+ "epoch": 0.32512935253810654,
658
+ "grad_norm": 2.7085490226745605,
659
+ "learning_rate": 3.374528038036638e-05,
660
+ "loss": 4.895,
661
+ "step": 9300
662
+ },
663
+ {
664
+ "epoch": 0.32862536708152706,
665
+ "grad_norm": 2.601956605911255,
666
+ "learning_rate": 3.3570479653195355e-05,
667
+ "loss": 4.9177,
668
+ "step": 9400
669
+ },
670
+ {
671
+ "epoch": 0.3321213816249476,
672
+ "grad_norm": 2.70379900932312,
673
+ "learning_rate": 3.339567892602433e-05,
674
+ "loss": 4.9543,
675
+ "step": 9500
676
+ },
677
+ {
678
+ "epoch": 0.33561739616836805,
679
+ "grad_norm": 2.8263227939605713,
680
+ "learning_rate": 3.322087819885331e-05,
681
+ "loss": 4.9403,
682
+ "step": 9600
683
+ },
684
+ {
685
+ "epoch": 0.3391134107117886,
686
+ "grad_norm": 2.645223617553711,
687
+ "learning_rate": 3.304607747168228e-05,
688
+ "loss": 4.9174,
689
+ "step": 9700
690
+ },
691
+ {
692
+ "epoch": 0.34260942525520904,
693
+ "grad_norm": 3.0595059394836426,
694
+ "learning_rate": 3.2871276744511257e-05,
695
+ "loss": 4.9093,
696
+ "step": 9800
697
+ },
698
+ {
699
+ "epoch": 0.34610543979862957,
700
+ "grad_norm": 3.042128324508667,
701
+ "learning_rate": 3.269647601734023e-05,
702
+ "loss": 4.8524,
703
+ "step": 9900
704
+ },
705
+ {
706
+ "epoch": 0.3496014543420501,
707
+ "grad_norm": 2.5243098735809326,
708
+ "learning_rate": 3.2521675290169204e-05,
709
+ "loss": 4.8915,
710
+ "step": 10000
711
+ },
712
+ {
713
+ "epoch": 0.35309746888547056,
714
+ "grad_norm": 2.696924924850464,
715
+ "learning_rate": 3.2346874562998184e-05,
716
+ "loss": 4.8823,
717
+ "step": 10100
718
+ },
719
+ {
720
+ "epoch": 0.3565934834288911,
721
+ "grad_norm": 3.047484874725342,
722
+ "learning_rate": 3.217207383582716e-05,
723
+ "loss": 4.8717,
724
+ "step": 10200
725
+ },
726
+ {
727
+ "epoch": 0.36008949797231155,
728
+ "grad_norm": 2.573148250579834,
729
+ "learning_rate": 3.199727310865613e-05,
730
+ "loss": 4.9094,
731
+ "step": 10300
732
+ },
733
+ {
734
+ "epoch": 0.36358551251573207,
735
+ "grad_norm": 2.771249532699585,
736
+ "learning_rate": 3.1822472381485105e-05,
737
+ "loss": 4.8894,
738
+ "step": 10400
739
+ },
740
+ {
741
+ "epoch": 0.3670815270591526,
742
+ "grad_norm": 2.7111799716949463,
743
+ "learning_rate": 3.164767165431408e-05,
744
+ "loss": 4.8781,
745
+ "step": 10500
746
+ },
747
+ {
748
+ "epoch": 0.37057754160257306,
749
+ "grad_norm": 2.443671941757202,
750
+ "learning_rate": 3.147287092714306e-05,
751
+ "loss": 4.911,
752
+ "step": 10600
753
+ },
754
+ {
755
+ "epoch": 0.3740735561459936,
756
+ "grad_norm": 3.0202887058258057,
757
+ "learning_rate": 3.129807019997203e-05,
758
+ "loss": 4.8911,
759
+ "step": 10700
760
+ },
761
+ {
762
+ "epoch": 0.37756957068941405,
763
+ "grad_norm": 2.860243320465088,
764
+ "learning_rate": 3.112326947280101e-05,
765
+ "loss": 4.8759,
766
+ "step": 10800
767
+ },
768
+ {
769
+ "epoch": 0.38106558523283457,
770
+ "grad_norm": 2.7219772338867188,
771
+ "learning_rate": 3.094846874562998e-05,
772
+ "loss": 4.8958,
773
+ "step": 10900
774
+ },
775
+ {
776
+ "epoch": 0.3845615997762551,
777
+ "grad_norm": 2.7428410053253174,
778
+ "learning_rate": 3.0773668018458954e-05,
779
+ "loss": 4.8559,
780
+ "step": 11000
781
+ },
782
+ {
783
+ "epoch": 0.38805761431967556,
784
+ "grad_norm": 2.6882712841033936,
785
+ "learning_rate": 3.0598867291287935e-05,
786
+ "loss": 4.9264,
787
+ "step": 11100
788
+ },
789
+ {
790
+ "epoch": 0.3915536288630961,
791
+ "grad_norm": 2.3838911056518555,
792
+ "learning_rate": 3.042406656411691e-05,
793
+ "loss": 4.8896,
794
+ "step": 11200
795
+ },
796
+ {
797
+ "epoch": 0.39504964340651655,
798
+ "grad_norm": 2.679856538772583,
799
+ "learning_rate": 3.0249265836945882e-05,
800
+ "loss": 4.8963,
801
+ "step": 11300
802
+ },
803
+ {
804
+ "epoch": 0.39854565794993707,
805
+ "grad_norm": 2.4506402015686035,
806
+ "learning_rate": 3.007446510977486e-05,
807
+ "loss": 4.917,
808
+ "step": 11400
809
+ },
810
+ {
811
+ "epoch": 0.4020416724933576,
812
+ "grad_norm": 3.169481039047241,
813
+ "learning_rate": 2.9899664382603833e-05,
814
+ "loss": 4.8908,
815
+ "step": 11500
816
+ },
817
+ {
818
+ "epoch": 0.40553768703677806,
819
+ "grad_norm": 2.744379997253418,
820
+ "learning_rate": 2.9724863655432806e-05,
821
+ "loss": 4.8491,
822
+ "step": 11600
823
+ },
824
+ {
825
+ "epoch": 0.4090337015801986,
826
+ "grad_norm": 2.633491039276123,
827
+ "learning_rate": 2.9550062928261784e-05,
828
+ "loss": 4.8553,
829
+ "step": 11700
830
+ },
831
+ {
832
+ "epoch": 0.41252971612361905,
833
+ "grad_norm": 2.816256284713745,
834
+ "learning_rate": 2.9375262201090757e-05,
835
+ "loss": 4.8653,
836
+ "step": 11800
837
+ },
838
+ {
839
+ "epoch": 0.4160257306670396,
840
+ "grad_norm": 2.6040494441986084,
841
+ "learning_rate": 2.920046147391973e-05,
842
+ "loss": 4.8381,
843
+ "step": 11900
844
+ },
845
+ {
846
+ "epoch": 0.4195217452104601,
847
+ "grad_norm": 3.032259225845337,
848
+ "learning_rate": 2.9025660746748705e-05,
849
+ "loss": 4.912,
850
+ "step": 12000
851
+ },
852
+ {
853
+ "epoch": 0.42301775975388056,
854
+ "grad_norm": 3.134244203567505,
855
+ "learning_rate": 2.885086001957768e-05,
856
+ "loss": 4.8971,
857
+ "step": 12100
858
+ },
859
+ {
860
+ "epoch": 0.4265137742973011,
861
+ "grad_norm": 2.9555444717407227,
862
+ "learning_rate": 2.867605929240666e-05,
863
+ "loss": 4.8879,
864
+ "step": 12200
865
+ },
866
+ {
867
+ "epoch": 0.43000978884072155,
868
+ "grad_norm": 2.4772422313690186,
869
+ "learning_rate": 2.8501258565235632e-05,
870
+ "loss": 4.8624,
871
+ "step": 12300
872
+ },
873
+ {
874
+ "epoch": 0.4335058033841421,
875
+ "grad_norm": 2.8734424114227295,
876
+ "learning_rate": 2.8326457838064606e-05,
877
+ "loss": 4.8553,
878
+ "step": 12400
879
+ },
880
+ {
881
+ "epoch": 0.4370018179275626,
882
+ "grad_norm": 2.7253541946411133,
883
+ "learning_rate": 2.815165711089358e-05,
884
+ "loss": 4.8841,
885
+ "step": 12500
886
+ },
887
+ {
888
+ "epoch": 0.44049783247098306,
889
+ "grad_norm": 3.075535297393799,
890
+ "learning_rate": 2.7976856383722553e-05,
891
+ "loss": 4.8871,
892
+ "step": 12600
893
+ },
894
+ {
895
+ "epoch": 0.4439938470144036,
896
+ "grad_norm": 2.6050570011138916,
897
+ "learning_rate": 2.7802055656551534e-05,
898
+ "loss": 4.8382,
899
+ "step": 12700
900
+ },
901
+ {
902
+ "epoch": 0.44748986155782405,
903
+ "grad_norm": 2.6890673637390137,
904
+ "learning_rate": 2.7627254929380508e-05,
905
+ "loss": 4.9089,
906
+ "step": 12800
907
+ },
908
+ {
909
+ "epoch": 0.4509858761012446,
910
+ "grad_norm": 2.602142572402954,
911
+ "learning_rate": 2.745245420220948e-05,
912
+ "loss": 4.8587,
913
+ "step": 12900
914
+ },
915
+ {
916
+ "epoch": 0.4544818906446651,
917
+ "grad_norm": 2.8939311504364014,
918
+ "learning_rate": 2.7277653475038455e-05,
919
+ "loss": 4.8508,
920
+ "step": 13000
921
+ },
922
+ {
923
+ "epoch": 0.45797790518808557,
924
+ "grad_norm": 3.0450758934020996,
925
+ "learning_rate": 2.710285274786743e-05,
926
+ "loss": 4.8884,
927
+ "step": 13100
928
+ },
929
+ {
930
+ "epoch": 0.4614739197315061,
931
+ "grad_norm": 2.4321885108947754,
932
+ "learning_rate": 2.692805202069641e-05,
933
+ "loss": 4.8996,
934
+ "step": 13200
935
+ },
936
+ {
937
+ "epoch": 0.4649699342749266,
938
+ "grad_norm": 2.4929943084716797,
939
+ "learning_rate": 2.6753251293525383e-05,
940
+ "loss": 4.8747,
941
+ "step": 13300
942
+ },
943
+ {
944
+ "epoch": 0.4684659488183471,
945
+ "grad_norm": 2.8290367126464844,
946
+ "learning_rate": 2.6578450566354356e-05,
947
+ "loss": 4.9006,
948
+ "step": 13400
949
+ },
950
+ {
951
+ "epoch": 0.4719619633617676,
952
+ "grad_norm": 3.063918352127075,
953
+ "learning_rate": 2.640364983918333e-05,
954
+ "loss": 4.8664,
955
+ "step": 13500
956
+ },
957
+ {
958
+ "epoch": 0.47545797790518807,
959
+ "grad_norm": 3.0412003993988037,
960
+ "learning_rate": 2.6228849112012304e-05,
961
+ "loss": 4.8873,
962
+ "step": 13600
963
+ },
964
+ {
965
+ "epoch": 0.4789539924486086,
966
+ "grad_norm": 3.148043155670166,
967
+ "learning_rate": 2.6054048384841284e-05,
968
+ "loss": 4.8698,
969
+ "step": 13700
970
+ },
971
+ {
972
+ "epoch": 0.4824500069920291,
973
+ "grad_norm": 2.88493013381958,
974
+ "learning_rate": 2.5879247657670258e-05,
975
+ "loss": 4.8648,
976
+ "step": 13800
977
+ },
978
+ {
979
+ "epoch": 0.4859460215354496,
980
+ "grad_norm": 2.653207302093506,
981
+ "learning_rate": 2.570444693049923e-05,
982
+ "loss": 4.8199,
983
+ "step": 13900
984
+ },
985
+ {
986
+ "epoch": 0.4894420360788701,
987
+ "grad_norm": 2.7175774574279785,
988
+ "learning_rate": 2.5529646203328205e-05,
989
+ "loss": 4.8838,
990
+ "step": 14000
991
+ },
992
+ {
993
+ "epoch": 0.49293805062229057,
994
+ "grad_norm": 2.9639148712158203,
995
+ "learning_rate": 2.535484547615718e-05,
996
+ "loss": 4.881,
997
+ "step": 14100
998
+ },
999
+ {
1000
+ "epoch": 0.4964340651657111,
1001
+ "grad_norm": 2.8595130443573,
1002
+ "learning_rate": 2.518004474898616e-05,
1003
+ "loss": 4.8871,
1004
+ "step": 14200
1005
+ },
1006
+ {
1007
+ "epoch": 0.4999300797091316,
1008
+ "grad_norm": 2.4576876163482666,
1009
+ "learning_rate": 2.5005244021815133e-05,
1010
+ "loss": 4.8357,
1011
+ "step": 14300
1012
+ },
1013
+ {
1014
+ "epoch": 0.5034260942525521,
1015
+ "grad_norm": 2.7865824699401855,
1016
+ "learning_rate": 2.4830443294644107e-05,
1017
+ "loss": 4.8026,
1018
+ "step": 14400
1019
+ },
1020
+ {
1021
+ "epoch": 0.5069221087959725,
1022
+ "grad_norm": 2.741359233856201,
1023
+ "learning_rate": 2.465564256747308e-05,
1024
+ "loss": 4.8478,
1025
+ "step": 14500
1026
+ },
1027
+ {
1028
+ "epoch": 0.5104181233393931,
1029
+ "grad_norm": 2.35073184967041,
1030
+ "learning_rate": 2.4480841840302057e-05,
1031
+ "loss": 4.8495,
1032
+ "step": 14600
1033
+ },
1034
+ {
1035
+ "epoch": 0.5139141378828136,
1036
+ "grad_norm": 2.672268867492676,
1037
+ "learning_rate": 2.430604111313103e-05,
1038
+ "loss": 4.8815,
1039
+ "step": 14700
1040
+ },
1041
+ {
1042
+ "epoch": 0.5174101524262341,
1043
+ "grad_norm": 2.589656114578247,
1044
+ "learning_rate": 2.4131240385960008e-05,
1045
+ "loss": 4.8814,
1046
+ "step": 14800
1047
+ },
1048
+ {
1049
+ "epoch": 0.5209061669696546,
1050
+ "grad_norm": 3.2942378520965576,
1051
+ "learning_rate": 2.3956439658788982e-05,
1052
+ "loss": 4.8035,
1053
+ "step": 14900
1054
+ },
1055
+ {
1056
+ "epoch": 0.524402181513075,
1057
+ "grad_norm": 2.683692216873169,
1058
+ "learning_rate": 2.3781638931617956e-05,
1059
+ "loss": 4.8741,
1060
+ "step": 15000
1061
+ },
1062
+ {
1063
+ "epoch": 0.5278981960564956,
1064
+ "grad_norm": 2.480876922607422,
1065
+ "learning_rate": 2.3606838204446933e-05,
1066
+ "loss": 4.8208,
1067
+ "step": 15100
1068
+ },
1069
+ {
1070
+ "epoch": 0.5313942105999161,
1071
+ "grad_norm": 3.073580503463745,
1072
+ "learning_rate": 2.3432037477275906e-05,
1073
+ "loss": 4.8279,
1074
+ "step": 15200
1075
+ },
1076
+ {
1077
+ "epoch": 0.5348902251433366,
1078
+ "grad_norm": 3.04691219329834,
1079
+ "learning_rate": 2.3257236750104883e-05,
1080
+ "loss": 4.8546,
1081
+ "step": 15300
1082
+ },
1083
+ {
1084
+ "epoch": 0.5383862396867571,
1085
+ "grad_norm": 3.1862494945526123,
1086
+ "learning_rate": 2.3082436022933857e-05,
1087
+ "loss": 4.8761,
1088
+ "step": 15400
1089
+ },
1090
+ {
1091
+ "epoch": 0.5418822542301776,
1092
+ "grad_norm": 3.067617654800415,
1093
+ "learning_rate": 2.290763529576283e-05,
1094
+ "loss": 4.8721,
1095
+ "step": 15500
1096
+ },
1097
+ {
1098
+ "epoch": 0.5453782687735981,
1099
+ "grad_norm": 3.071998119354248,
1100
+ "learning_rate": 2.2732834568591808e-05,
1101
+ "loss": 4.8404,
1102
+ "step": 15600
1103
+ },
1104
+ {
1105
+ "epoch": 0.5488742833170186,
1106
+ "grad_norm": 3.0393383502960205,
1107
+ "learning_rate": 2.255803384142078e-05,
1108
+ "loss": 4.8632,
1109
+ "step": 15700
1110
+ },
1111
+ {
1112
+ "epoch": 0.5523702978604391,
1113
+ "grad_norm": 2.7928342819213867,
1114
+ "learning_rate": 2.238323311424976e-05,
1115
+ "loss": 4.8156,
1116
+ "step": 15800
1117
+ },
1118
+ {
1119
+ "epoch": 0.5558663124038596,
1120
+ "grad_norm": 2.616582155227661,
1121
+ "learning_rate": 2.2208432387078732e-05,
1122
+ "loss": 4.8612,
1123
+ "step": 15900
1124
+ },
1125
+ {
1126
+ "epoch": 0.55936232694728,
1127
+ "grad_norm": 2.9017162322998047,
1128
+ "learning_rate": 2.2033631659907706e-05,
1129
+ "loss": 4.8416,
1130
+ "step": 16000
1131
+ },
1132
+ {
1133
+ "epoch": 0.5628583414907006,
1134
+ "grad_norm": 3.112460136413574,
1135
+ "learning_rate": 2.1858830932736683e-05,
1136
+ "loss": 4.8362,
1137
+ "step": 16100
1138
+ },
1139
+ {
1140
+ "epoch": 0.5663543560341211,
1141
+ "grad_norm": 2.453512668609619,
1142
+ "learning_rate": 2.1684030205565657e-05,
1143
+ "loss": 4.8558,
1144
+ "step": 16200
1145
+ },
1146
+ {
1147
+ "epoch": 0.5698503705775416,
1148
+ "grad_norm": 2.6998403072357178,
1149
+ "learning_rate": 2.1509229478394634e-05,
1150
+ "loss": 4.8444,
1151
+ "step": 16300
1152
+ },
1153
+ {
1154
+ "epoch": 0.5733463851209621,
1155
+ "grad_norm": 3.0294365882873535,
1156
+ "learning_rate": 2.1334428751223607e-05,
1157
+ "loss": 4.8883,
1158
+ "step": 16400
1159
+ },
1160
+ {
1161
+ "epoch": 0.5768423996643826,
1162
+ "grad_norm": 3.008953809738159,
1163
+ "learning_rate": 2.115962802405258e-05,
1164
+ "loss": 4.8333,
1165
+ "step": 16500
1166
+ },
1167
+ {
1168
+ "epoch": 0.5803384142078031,
1169
+ "grad_norm": 3.1423451900482178,
1170
+ "learning_rate": 2.0984827296881558e-05,
1171
+ "loss": 4.8312,
1172
+ "step": 16600
1173
+ },
1174
+ {
1175
+ "epoch": 0.5838344287512236,
1176
+ "grad_norm": 2.988421678543091,
1177
+ "learning_rate": 2.0810026569710532e-05,
1178
+ "loss": 4.8608,
1179
+ "step": 16700
1180
+ },
1181
+ {
1182
+ "epoch": 0.5873304432946441,
1183
+ "grad_norm": 2.9294745922088623,
1184
+ "learning_rate": 2.063522584253951e-05,
1185
+ "loss": 4.8528,
1186
+ "step": 16800
1187
+ },
1188
+ {
1189
+ "epoch": 0.5908264578380646,
1190
+ "grad_norm": 2.831146478652954,
1191
+ "learning_rate": 2.0460425115368483e-05,
1192
+ "loss": 4.8472,
1193
+ "step": 16900
1194
+ },
1195
+ {
1196
+ "epoch": 0.5943224723814851,
1197
+ "grad_norm": 2.5871362686157227,
1198
+ "learning_rate": 2.0285624388197456e-05,
1199
+ "loss": 4.8068,
1200
+ "step": 17000
1201
+ },
1202
+ {
1203
+ "epoch": 0.5978184869249056,
1204
+ "grad_norm": 2.6208789348602295,
1205
+ "learning_rate": 2.0110823661026433e-05,
1206
+ "loss": 4.8573,
1207
+ "step": 17100
1208
+ },
1209
+ {
1210
+ "epoch": 0.6013145014683261,
1211
+ "grad_norm": 3.1844122409820557,
1212
+ "learning_rate": 1.9936022933855407e-05,
1213
+ "loss": 4.8837,
1214
+ "step": 17200
1215
+ },
1216
+ {
1217
+ "epoch": 0.6048105160117466,
1218
+ "grad_norm": 2.6045706272125244,
1219
+ "learning_rate": 1.976122220668438e-05,
1220
+ "loss": 4.8873,
1221
+ "step": 17300
1222
+ },
1223
+ {
1224
+ "epoch": 0.6083065305551671,
1225
+ "grad_norm": 2.6569807529449463,
1226
+ "learning_rate": 1.9586421479513354e-05,
1227
+ "loss": 4.8533,
1228
+ "step": 17400
1229
+ },
1230
+ {
1231
+ "epoch": 0.6118025450985876,
1232
+ "grad_norm": 2.7514195442199707,
1233
+ "learning_rate": 1.941162075234233e-05,
1234
+ "loss": 4.8454,
1235
+ "step": 17500
1236
+ },
1237
+ {
1238
+ "epoch": 0.6152985596420081,
1239
+ "grad_norm": 3.129321575164795,
1240
+ "learning_rate": 1.9236820025171305e-05,
1241
+ "loss": 4.8622,
1242
+ "step": 17600
1243
+ },
1244
+ {
1245
+ "epoch": 0.6187945741854286,
1246
+ "grad_norm": 2.610549211502075,
1247
+ "learning_rate": 1.906201929800028e-05,
1248
+ "loss": 4.7976,
1249
+ "step": 17700
1250
+ },
1251
+ {
1252
+ "epoch": 0.6222905887288491,
1253
+ "grad_norm": 2.735844135284424,
1254
+ "learning_rate": 1.8887218570829256e-05,
1255
+ "loss": 4.8136,
1256
+ "step": 17800
1257
+ },
1258
+ {
1259
+ "epoch": 0.6257866032722696,
1260
+ "grad_norm": 3.020894765853882,
1261
+ "learning_rate": 1.871241784365823e-05,
1262
+ "loss": 4.8348,
1263
+ "step": 17900
1264
+ },
1265
+ {
1266
+ "epoch": 0.6292826178156901,
1267
+ "grad_norm": 3.2946112155914307,
1268
+ "learning_rate": 1.8537617116487203e-05,
1269
+ "loss": 4.8435,
1270
+ "step": 18000
1271
+ },
1272
+ {
1273
+ "epoch": 0.6327786323591106,
1274
+ "grad_norm": 2.6459012031555176,
1275
+ "learning_rate": 1.836281638931618e-05,
1276
+ "loss": 4.8705,
1277
+ "step": 18100
1278
+ },
1279
+ {
1280
+ "epoch": 0.6362746469025311,
1281
+ "grad_norm": 3.0113513469696045,
1282
+ "learning_rate": 1.8188015662145154e-05,
1283
+ "loss": 4.8347,
1284
+ "step": 18200
1285
+ },
1286
+ {
1287
+ "epoch": 0.6397706614459516,
1288
+ "grad_norm": 2.8123104572296143,
1289
+ "learning_rate": 1.8013214934974128e-05,
1290
+ "loss": 4.8568,
1291
+ "step": 18300
1292
+ },
1293
+ {
1294
+ "epoch": 0.6432666759893721,
1295
+ "grad_norm": 2.8283095359802246,
1296
+ "learning_rate": 1.7838414207803105e-05,
1297
+ "loss": 4.8248,
1298
+ "step": 18400
1299
+ },
1300
+ {
1301
+ "epoch": 0.6467626905327926,
1302
+ "grad_norm": 3.098484516143799,
1303
+ "learning_rate": 1.766361348063208e-05,
1304
+ "loss": 4.825,
1305
+ "step": 18500
1306
+ },
1307
+ {
1308
+ "epoch": 0.6502587050762131,
1309
+ "grad_norm": 2.566540002822876,
1310
+ "learning_rate": 1.7488812753461055e-05,
1311
+ "loss": 4.7884,
1312
+ "step": 18600
1313
+ },
1314
+ {
1315
+ "epoch": 0.6537547196196336,
1316
+ "grad_norm": 2.4459304809570312,
1317
+ "learning_rate": 1.731401202629003e-05,
1318
+ "loss": 4.8451,
1319
+ "step": 18700
1320
+ },
1321
+ {
1322
+ "epoch": 0.6572507341630541,
1323
+ "grad_norm": 3.134326457977295,
1324
+ "learning_rate": 1.7139211299119003e-05,
1325
+ "loss": 4.8586,
1326
+ "step": 18800
1327
+ },
1328
+ {
1329
+ "epoch": 0.6607467487064747,
1330
+ "grad_norm": 2.64436674118042,
1331
+ "learning_rate": 1.696441057194798e-05,
1332
+ "loss": 4.8463,
1333
+ "step": 18900
1334
+ },
1335
+ {
1336
+ "epoch": 0.6642427632498952,
1337
+ "grad_norm": 2.925896167755127,
1338
+ "learning_rate": 1.6789609844776954e-05,
1339
+ "loss": 4.8216,
1340
+ "step": 19000
1341
+ },
1342
+ {
1343
+ "epoch": 0.6677387777933156,
1344
+ "grad_norm": 2.873636245727539,
1345
+ "learning_rate": 1.661480911760593e-05,
1346
+ "loss": 4.8579,
1347
+ "step": 19100
1348
+ },
1349
+ {
1350
+ "epoch": 0.6712347923367361,
1351
+ "grad_norm": 2.8140294551849365,
1352
+ "learning_rate": 1.6440008390434904e-05,
1353
+ "loss": 4.8282,
1354
+ "step": 19200
1355
+ },
1356
+ {
1357
+ "epoch": 0.6747308068801566,
1358
+ "grad_norm": 2.66567325592041,
1359
+ "learning_rate": 1.6265207663263878e-05,
1360
+ "loss": 4.8042,
1361
+ "step": 19300
1362
+ },
1363
+ {
1364
+ "epoch": 0.6782268214235772,
1365
+ "grad_norm": 2.895731210708618,
1366
+ "learning_rate": 1.6090406936092855e-05,
1367
+ "loss": 4.81,
1368
+ "step": 19400
1369
+ },
1370
+ {
1371
+ "epoch": 0.6817228359669977,
1372
+ "grad_norm": 2.673616886138916,
1373
+ "learning_rate": 1.591560620892183e-05,
1374
+ "loss": 4.8267,
1375
+ "step": 19500
1376
+ },
1377
+ {
1378
+ "epoch": 0.6852188505104181,
1379
+ "grad_norm": 2.8910722732543945,
1380
+ "learning_rate": 1.5740805481750806e-05,
1381
+ "loss": 4.8449,
1382
+ "step": 19600
1383
+ },
1384
+ {
1385
+ "epoch": 0.6887148650538386,
1386
+ "grad_norm": 3.0458858013153076,
1387
+ "learning_rate": 1.556600475457978e-05,
1388
+ "loss": 4.8022,
1389
+ "step": 19700
1390
+ },
1391
+ {
1392
+ "epoch": 0.6922108795972591,
1393
+ "grad_norm": 2.440716028213501,
1394
+ "learning_rate": 1.5391204027408753e-05,
1395
+ "loss": 4.8125,
1396
+ "step": 19800
1397
+ },
1398
+ {
1399
+ "epoch": 0.6957068941406797,
1400
+ "grad_norm": 2.811601161956787,
1401
+ "learning_rate": 1.521640330023773e-05,
1402
+ "loss": 4.8567,
1403
+ "step": 19900
1404
+ },
1405
+ {
1406
+ "epoch": 0.6992029086841002,
1407
+ "grad_norm": 2.6201772689819336,
1408
+ "learning_rate": 1.5041602573066704e-05,
1409
+ "loss": 4.8375,
1410
+ "step": 20000
1411
+ },
1412
+ {
1413
+ "epoch": 0.7026989232275206,
1414
+ "grad_norm": 3.1684257984161377,
1415
+ "learning_rate": 1.4866801845895681e-05,
1416
+ "loss": 4.8707,
1417
+ "step": 20100
1418
+ },
1419
+ {
1420
+ "epoch": 0.7061949377709411,
1421
+ "grad_norm": 2.945105791091919,
1422
+ "learning_rate": 1.4692001118724655e-05,
1423
+ "loss": 4.8065,
1424
+ "step": 20200
1425
+ },
1426
+ {
1427
+ "epoch": 0.7096909523143616,
1428
+ "grad_norm": 2.767002820968628,
1429
+ "learning_rate": 1.4517200391553628e-05,
1430
+ "loss": 4.8594,
1431
+ "step": 20300
1432
+ },
1433
+ {
1434
+ "epoch": 0.7131869668577822,
1435
+ "grad_norm": 2.928342819213867,
1436
+ "learning_rate": 1.4342399664382605e-05,
1437
+ "loss": 4.8418,
1438
+ "step": 20400
1439
+ },
1440
+ {
1441
+ "epoch": 0.7166829814012027,
1442
+ "grad_norm": 2.7073936462402344,
1443
+ "learning_rate": 1.4167598937211579e-05,
1444
+ "loss": 4.8515,
1445
+ "step": 20500
1446
+ },
1447
+ {
1448
+ "epoch": 0.7201789959446231,
1449
+ "grad_norm": 3.105440616607666,
1450
+ "learning_rate": 1.3992798210040556e-05,
1451
+ "loss": 4.8075,
1452
+ "step": 20600
1453
+ },
1454
+ {
1455
+ "epoch": 0.7236750104880436,
1456
+ "grad_norm": 2.8033523559570312,
1457
+ "learning_rate": 1.381799748286953e-05,
1458
+ "loss": 4.8108,
1459
+ "step": 20700
1460
+ },
1461
+ {
1462
+ "epoch": 0.7271710250314641,
1463
+ "grad_norm": 2.9374163150787354,
1464
+ "learning_rate": 1.3643196755698503e-05,
1465
+ "loss": 4.8197,
1466
+ "step": 20800
1467
+ },
1468
+ {
1469
+ "epoch": 0.7306670395748847,
1470
+ "grad_norm": 2.5617268085479736,
1471
+ "learning_rate": 1.346839602852748e-05,
1472
+ "loss": 4.8105,
1473
+ "step": 20900
1474
+ },
1475
+ {
1476
+ "epoch": 0.7341630541183052,
1477
+ "grad_norm": 2.8728411197662354,
1478
+ "learning_rate": 1.3293595301356454e-05,
1479
+ "loss": 4.8447,
1480
+ "step": 21000
1481
+ },
1482
+ {
1483
+ "epoch": 0.7376590686617256,
1484
+ "grad_norm": 2.78499436378479,
1485
+ "learning_rate": 1.3118794574185428e-05,
1486
+ "loss": 4.79,
1487
+ "step": 21100
1488
+ },
1489
+ {
1490
+ "epoch": 0.7411550832051461,
1491
+ "grad_norm": 2.84222412109375,
1492
+ "learning_rate": 1.2943993847014405e-05,
1493
+ "loss": 4.8189,
1494
+ "step": 21200
1495
+ },
1496
+ {
1497
+ "epoch": 0.7446510977485666,
1498
+ "grad_norm": 2.6608426570892334,
1499
+ "learning_rate": 1.2769193119843379e-05,
1500
+ "loss": 4.7787,
1501
+ "step": 21300
1502
+ },
1503
+ {
1504
+ "epoch": 0.7481471122919872,
1505
+ "grad_norm": 2.662895917892456,
1506
+ "learning_rate": 1.2594392392672356e-05,
1507
+ "loss": 4.8287,
1508
+ "step": 21400
1509
+ },
1510
+ {
1511
+ "epoch": 0.7516431268354077,
1512
+ "grad_norm": 2.7515127658843994,
1513
+ "learning_rate": 1.241959166550133e-05,
1514
+ "loss": 4.7988,
1515
+ "step": 21500
1516
+ },
1517
+ {
1518
+ "epoch": 0.7551391413788281,
1519
+ "grad_norm": 2.9696640968322754,
1520
+ "learning_rate": 1.2244790938330305e-05,
1521
+ "loss": 4.82,
1522
+ "step": 21600
1523
+ },
1524
+ {
1525
+ "epoch": 0.7586351559222486,
1526
+ "grad_norm": 3.002441644668579,
1527
+ "learning_rate": 1.206999021115928e-05,
1528
+ "loss": 4.7711,
1529
+ "step": 21700
1530
+ },
1531
+ {
1532
+ "epoch": 0.7621311704656691,
1533
+ "grad_norm": 3.171853542327881,
1534
+ "learning_rate": 1.1895189483988254e-05,
1535
+ "loss": 4.8068,
1536
+ "step": 21800
1537
+ },
1538
+ {
1539
+ "epoch": 0.7656271850090897,
1540
+ "grad_norm": 2.604813575744629,
1541
+ "learning_rate": 1.172038875681723e-05,
1542
+ "loss": 4.842,
1543
+ "step": 21900
1544
+ },
1545
+ {
1546
+ "epoch": 0.7691231995525102,
1547
+ "grad_norm": 3.086451768875122,
1548
+ "learning_rate": 1.1545588029646205e-05,
1549
+ "loss": 4.8064,
1550
+ "step": 22000
1551
+ },
1552
+ {
1553
+ "epoch": 0.7726192140959306,
1554
+ "grad_norm": 3.1282565593719482,
1555
+ "learning_rate": 1.1370787302475178e-05,
1556
+ "loss": 4.827,
1557
+ "step": 22100
1558
+ },
1559
+ {
1560
+ "epoch": 0.7761152286393511,
1561
+ "grad_norm": 2.6735763549804688,
1562
+ "learning_rate": 1.1195986575304154e-05,
1563
+ "loss": 4.792,
1564
+ "step": 22200
1565
+ },
1566
+ {
1567
+ "epoch": 0.7796112431827716,
1568
+ "grad_norm": 2.88252329826355,
1569
+ "learning_rate": 1.1021185848133129e-05,
1570
+ "loss": 4.8025,
1571
+ "step": 22300
1572
+ },
1573
+ {
1574
+ "epoch": 0.7831072577261922,
1575
+ "grad_norm": 3.1057016849517822,
1576
+ "learning_rate": 1.0846385120962103e-05,
1577
+ "loss": 4.8332,
1578
+ "step": 22400
1579
+ },
1580
+ {
1581
+ "epoch": 0.7866032722696127,
1582
+ "grad_norm": 2.930310010910034,
1583
+ "learning_rate": 1.0671584393791078e-05,
1584
+ "loss": 4.8156,
1585
+ "step": 22500
1586
+ },
1587
+ {
1588
+ "epoch": 0.7900992868130331,
1589
+ "grad_norm": 2.785041093826294,
1590
+ "learning_rate": 1.0496783666620053e-05,
1591
+ "loss": 4.8164,
1592
+ "step": 22600
1593
+ },
1594
+ {
1595
+ "epoch": 0.7935953013564536,
1596
+ "grad_norm": 3.089155912399292,
1597
+ "learning_rate": 1.0321982939449029e-05,
1598
+ "loss": 4.8126,
1599
+ "step": 22700
1600
+ },
1601
+ {
1602
+ "epoch": 0.7970913158998741,
1603
+ "grad_norm": 2.8560636043548584,
1604
+ "learning_rate": 1.0147182212278002e-05,
1605
+ "loss": 4.8162,
1606
+ "step": 22800
1607
+ },
1608
+ {
1609
+ "epoch": 0.8005873304432947,
1610
+ "grad_norm": 2.9523401260375977,
1611
+ "learning_rate": 9.972381485106978e-06,
1612
+ "loss": 4.8756,
1613
+ "step": 22900
1614
+ },
1615
+ {
1616
+ "epoch": 0.8040833449867152,
1617
+ "grad_norm": 2.6021039485931396,
1618
+ "learning_rate": 9.797580757935953e-06,
1619
+ "loss": 4.8416,
1620
+ "step": 23000
1621
+ },
1622
+ {
1623
+ "epoch": 0.8075793595301356,
1624
+ "grad_norm": 2.580181121826172,
1625
+ "learning_rate": 9.622780030764929e-06,
1626
+ "loss": 4.8208,
1627
+ "step": 23100
1628
+ },
1629
+ {
1630
+ "epoch": 0.8110753740735561,
1631
+ "grad_norm": 2.7473840713500977,
1632
+ "learning_rate": 9.447979303593904e-06,
1633
+ "loss": 4.8074,
1634
+ "step": 23200
1635
+ },
1636
+ {
1637
+ "epoch": 0.8145713886169766,
1638
+ "grad_norm": 2.52461314201355,
1639
+ "learning_rate": 9.273178576422878e-06,
1640
+ "loss": 4.8019,
1641
+ "step": 23300
1642
+ },
1643
+ {
1644
+ "epoch": 0.8180674031603972,
1645
+ "grad_norm": 3.13628888130188,
1646
+ "learning_rate": 9.098377849251853e-06,
1647
+ "loss": 4.8007,
1648
+ "step": 23400
1649
+ },
1650
+ {
1651
+ "epoch": 0.8215634177038177,
1652
+ "grad_norm": 2.702481746673584,
1653
+ "learning_rate": 8.923577122080828e-06,
1654
+ "loss": 4.8039,
1655
+ "step": 23500
1656
+ },
1657
+ {
1658
+ "epoch": 0.8250594322472381,
1659
+ "grad_norm": 3.0160887241363525,
1660
+ "learning_rate": 8.748776394909804e-06,
1661
+ "loss": 4.8218,
1662
+ "step": 23600
1663
+ },
1664
+ {
1665
+ "epoch": 0.8285554467906586,
1666
+ "grad_norm": 3.1599791049957275,
1667
+ "learning_rate": 8.573975667738777e-06,
1668
+ "loss": 4.7874,
1669
+ "step": 23700
1670
+ },
1671
+ {
1672
+ "epoch": 0.8320514613340791,
1673
+ "grad_norm": 2.9699933528900146,
1674
+ "learning_rate": 8.399174940567753e-06,
1675
+ "loss": 4.8316,
1676
+ "step": 23800
1677
+ },
1678
+ {
1679
+ "epoch": 0.8355474758774997,
1680
+ "grad_norm": 2.7698373794555664,
1681
+ "learning_rate": 8.224374213396728e-06,
1682
+ "loss": 4.838,
1683
+ "step": 23900
1684
+ },
1685
+ {
1686
+ "epoch": 0.8390434904209202,
1687
+ "grad_norm": 2.6946427822113037,
1688
+ "learning_rate": 8.049573486225704e-06,
1689
+ "loss": 4.7893,
1690
+ "step": 24000
1691
+ },
1692
+ {
1693
+ "epoch": 0.8425395049643406,
1694
+ "grad_norm": 3.0160024166107178,
1695
+ "learning_rate": 7.874772759054679e-06,
1696
+ "loss": 4.8058,
1697
+ "step": 24100
1698
+ },
1699
+ {
1700
+ "epoch": 0.8460355195077611,
1701
+ "grad_norm": 2.491325855255127,
1702
+ "learning_rate": 7.699972031883653e-06,
1703
+ "loss": 4.7965,
1704
+ "step": 24200
1705
+ },
1706
+ {
1707
+ "epoch": 0.8495315340511816,
1708
+ "grad_norm": 3.0036044120788574,
1709
+ "learning_rate": 7.525171304712628e-06,
1710
+ "loss": 4.7869,
1711
+ "step": 24300
1712
+ },
1713
+ {
1714
+ "epoch": 0.8530275485946022,
1715
+ "grad_norm": 3.335988759994507,
1716
+ "learning_rate": 7.350370577541603e-06,
1717
+ "loss": 4.8458,
1718
+ "step": 24400
1719
+ },
1720
+ {
1721
+ "epoch": 0.8565235631380227,
1722
+ "grad_norm": 2.7462995052337646,
1723
+ "learning_rate": 7.175569850370578e-06,
1724
+ "loss": 4.7888,
1725
+ "step": 24500
1726
+ },
1727
+ {
1728
+ "epoch": 0.8600195776814431,
1729
+ "grad_norm": 2.402048110961914,
1730
+ "learning_rate": 7.000769123199553e-06,
1731
+ "loss": 4.7661,
1732
+ "step": 24600
1733
+ },
1734
+ {
1735
+ "epoch": 0.8635155922248636,
1736
+ "grad_norm": 2.752999782562256,
1737
+ "learning_rate": 6.825968396028527e-06,
1738
+ "loss": 4.8055,
1739
+ "step": 24700
1740
+ },
1741
+ {
1742
+ "epoch": 0.8670116067682841,
1743
+ "grad_norm": 2.9346210956573486,
1744
+ "learning_rate": 6.651167668857502e-06,
1745
+ "loss": 4.7676,
1746
+ "step": 24800
1747
+ },
1748
+ {
1749
+ "epoch": 0.8705076213117047,
1750
+ "grad_norm": 2.790853261947632,
1751
+ "learning_rate": 6.476366941686478e-06,
1752
+ "loss": 4.7821,
1753
+ "step": 24900
1754
+ },
1755
+ {
1756
+ "epoch": 0.8740036358551252,
1757
+ "grad_norm": 2.6606948375701904,
1758
+ "learning_rate": 6.301566214515453e-06,
1759
+ "loss": 4.7845,
1760
+ "step": 25000
1761
+ },
1762
+ {
1763
+ "epoch": 0.8774996503985456,
1764
+ "grad_norm": 2.6995749473571777,
1765
+ "learning_rate": 6.1267654873444275e-06,
1766
+ "loss": 4.8264,
1767
+ "step": 25100
1768
+ },
1769
+ {
1770
+ "epoch": 0.8809956649419661,
1771
+ "grad_norm": 2.790954351425171,
1772
+ "learning_rate": 5.951964760173403e-06,
1773
+ "loss": 4.8493,
1774
+ "step": 25200
1775
+ },
1776
+ {
1777
+ "epoch": 0.8844916794853867,
1778
+ "grad_norm": 2.903285264968872,
1779
+ "learning_rate": 5.777164033002377e-06,
1780
+ "loss": 4.8131,
1781
+ "step": 25300
1782
+ },
1783
+ {
1784
+ "epoch": 0.8879876940288072,
1785
+ "grad_norm": 2.517873525619507,
1786
+ "learning_rate": 5.602363305831353e-06,
1787
+ "loss": 4.7983,
1788
+ "step": 25400
1789
+ },
1790
+ {
1791
+ "epoch": 0.8914837085722277,
1792
+ "grad_norm": 2.7883822917938232,
1793
+ "learning_rate": 5.427562578660327e-06,
1794
+ "loss": 4.778,
1795
+ "step": 25500
1796
+ },
1797
+ {
1798
+ "epoch": 0.8949797231156481,
1799
+ "grad_norm": 3.072084426879883,
1800
+ "learning_rate": 5.252761851489303e-06,
1801
+ "loss": 4.8017,
1802
+ "step": 25600
1803
+ },
1804
+ {
1805
+ "epoch": 0.8984757376590686,
1806
+ "grad_norm": 2.7740066051483154,
1807
+ "learning_rate": 5.077961124318278e-06,
1808
+ "loss": 4.8314,
1809
+ "step": 25700
1810
+ },
1811
+ {
1812
+ "epoch": 0.9019717522024892,
1813
+ "grad_norm": 2.706472873687744,
1814
+ "learning_rate": 4.903160397147253e-06,
1815
+ "loss": 4.7765,
1816
+ "step": 25800
1817
+ },
1818
+ {
1819
+ "epoch": 0.9054677667459097,
1820
+ "grad_norm": 3.1904239654541016,
1821
+ "learning_rate": 4.728359669976228e-06,
1822
+ "loss": 4.8255,
1823
+ "step": 25900
1824
+ },
1825
+ {
1826
+ "epoch": 0.9089637812893302,
1827
+ "grad_norm": 2.787018299102783,
1828
+ "learning_rate": 4.5535589428052025e-06,
1829
+ "loss": 4.8399,
1830
+ "step": 26000
1831
+ },
1832
+ {
1833
+ "epoch": 0.9124597958327507,
1834
+ "grad_norm": 2.6253585815429688,
1835
+ "learning_rate": 4.378758215634177e-06,
1836
+ "loss": 4.8217,
1837
+ "step": 26100
1838
+ },
1839
+ {
1840
+ "epoch": 0.9159558103761711,
1841
+ "grad_norm": 2.7230615615844727,
1842
+ "learning_rate": 4.2039574884631515e-06,
1843
+ "loss": 4.7865,
1844
+ "step": 26200
1845
+ },
1846
+ {
1847
+ "epoch": 0.9194518249195917,
1848
+ "grad_norm": 3.5472490787506104,
1849
+ "learning_rate": 4.029156761292127e-06,
1850
+ "loss": 4.7454,
1851
+ "step": 26300
1852
+ },
1853
+ {
1854
+ "epoch": 0.9229478394630122,
1855
+ "grad_norm": 2.610646963119507,
1856
+ "learning_rate": 3.854356034121102e-06,
1857
+ "loss": 4.7943,
1858
+ "step": 26400
1859
+ },
1860
+ {
1861
+ "epoch": 0.9264438540064327,
1862
+ "grad_norm": 2.580254316329956,
1863
+ "learning_rate": 3.679555306950077e-06,
1864
+ "loss": 4.8029,
1865
+ "step": 26500
1866
+ },
1867
+ {
1868
+ "epoch": 0.9299398685498532,
1869
+ "grad_norm": 2.8292527198791504,
1870
+ "learning_rate": 3.504754579779052e-06,
1871
+ "loss": 4.8067,
1872
+ "step": 26600
1873
+ },
1874
+ {
1875
+ "epoch": 0.9334358830932736,
1876
+ "grad_norm": 2.95322322845459,
1877
+ "learning_rate": 3.3299538526080267e-06,
1878
+ "loss": 4.7623,
1879
+ "step": 26700
1880
+ },
1881
+ {
1882
+ "epoch": 0.9369318976366942,
1883
+ "grad_norm": 2.604645013809204,
1884
+ "learning_rate": 3.155153125437002e-06,
1885
+ "loss": 4.8072,
1886
+ "step": 26800
1887
+ },
1888
+ {
1889
+ "epoch": 0.9404279121801147,
1890
+ "grad_norm": 2.9846179485321045,
1891
+ "learning_rate": 2.980352398265977e-06,
1892
+ "loss": 4.7879,
1893
+ "step": 26900
1894
+ },
1895
+ {
1896
+ "epoch": 0.9439239267235352,
1897
+ "grad_norm": 2.7782154083251953,
1898
+ "learning_rate": 2.805551671094952e-06,
1899
+ "loss": 4.7872,
1900
+ "step": 27000
1901
+ },
1902
+ {
1903
+ "epoch": 0.9474199412669557,
1904
+ "grad_norm": 3.231360673904419,
1905
+ "learning_rate": 2.630750943923927e-06,
1906
+ "loss": 4.8513,
1907
+ "step": 27100
1908
+ },
1909
+ {
1910
+ "epoch": 0.9509159558103761,
1911
+ "grad_norm": 2.78902006149292,
1912
+ "learning_rate": 2.455950216752902e-06,
1913
+ "loss": 4.8163,
1914
+ "step": 27200
1915
+ },
1916
+ {
1917
+ "epoch": 0.9544119703537967,
1918
+ "grad_norm": 2.885953903198242,
1919
+ "learning_rate": 2.281149489581877e-06,
1920
+ "loss": 4.7673,
1921
+ "step": 27300
1922
+ },
1923
+ {
1924
+ "epoch": 0.9579079848972172,
1925
+ "grad_norm": 3.1494991779327393,
1926
+ "learning_rate": 2.1063487624108518e-06,
1927
+ "loss": 4.7808,
1928
+ "step": 27400
1929
+ },
1930
+ {
1931
+ "epoch": 0.9614039994406377,
1932
+ "grad_norm": 2.541231870651245,
1933
+ "learning_rate": 1.9315480352398267e-06,
1934
+ "loss": 4.8166,
1935
+ "step": 27500
1936
+ },
1937
+ {
1938
+ "epoch": 0.9649000139840582,
1939
+ "grad_norm": 2.773287773132324,
1940
+ "learning_rate": 1.7567473080688017e-06,
1941
+ "loss": 4.7576,
1942
+ "step": 27600
1943
+ },
1944
+ {
1945
+ "epoch": 0.9683960285274786,
1946
+ "grad_norm": 2.559238910675049,
1947
+ "learning_rate": 1.5819465808977766e-06,
1948
+ "loss": 4.7819,
1949
+ "step": 27700
1950
+ },
1951
+ {
1952
+ "epoch": 0.9718920430708992,
1953
+ "grad_norm": 2.684382200241089,
1954
+ "learning_rate": 1.4071458537267516e-06,
1955
+ "loss": 4.783,
1956
+ "step": 27800
1957
+ },
1958
+ {
1959
+ "epoch": 0.9753880576143197,
1960
+ "grad_norm": 2.598681688308716,
1961
+ "learning_rate": 1.2323451265557265e-06,
1962
+ "loss": 4.785,
1963
+ "step": 27900
1964
+ },
1965
+ {
1966
+ "epoch": 0.9788840721577402,
1967
+ "grad_norm": 2.9225780963897705,
1968
+ "learning_rate": 1.0575443993847015e-06,
1969
+ "loss": 4.7619,
1970
+ "step": 28000
1971
+ },
1972
+ {
1973
+ "epoch": 0.9823800867011607,
1974
+ "grad_norm": 2.758411407470703,
1975
+ "learning_rate": 8.827436722136765e-07,
1976
+ "loss": 4.8523,
1977
+ "step": 28100
1978
+ },
1979
+ {
1980
+ "epoch": 0.9858761012445811,
1981
+ "grad_norm": 3.0110673904418945,
1982
+ "learning_rate": 7.079429450426515e-07,
1983
+ "loss": 4.7437,
1984
+ "step": 28200
1985
+ },
1986
+ {
1987
+ "epoch": 0.9893721157880017,
1988
+ "grad_norm": 2.6037356853485107,
1989
+ "learning_rate": 5.331422178716264e-07,
1990
+ "loss": 4.7638,
1991
+ "step": 28300
1992
+ },
1993
+ {
1994
+ "epoch": 0.9928681303314222,
1995
+ "grad_norm": 3.267136335372925,
1996
+ "learning_rate": 3.5834149070060136e-07,
1997
+ "loss": 4.7422,
1998
+ "step": 28400
1999
+ },
2000
+ {
2001
+ "epoch": 0.9963641448748427,
2002
+ "grad_norm": 2.53847599029541,
2003
+ "learning_rate": 1.835407635295763e-07,
2004
+ "loss": 4.8052,
2005
+ "step": 28500
2006
+ },
2007
+ {
2008
+ "epoch": 0.9998601594182632,
2009
+ "grad_norm": 2.709245204925537,
2010
+ "learning_rate": 8.740036358551251e-09,
2011
+ "loss": 4.7659,
2012
+ "step": 28600
2013
+ }
2014
+ ],
2015
+ "logging_steps": 100,
2016
+ "max_steps": 28604,
2017
+ "num_input_tokens_seen": 0,
2018
+ "num_train_epochs": 1,
2019
+ "save_steps": 500,
2020
+ "stateful_callbacks": {
2021
+ "TrainerControl": {
2022
+ "args": {
2023
+ "should_epoch_stop": false,
2024
+ "should_evaluate": false,
2025
+ "should_log": false,
2026
+ "should_save": true,
2027
+ "should_training_stop": true
2028
+ },
2029
+ "attributes": {}
2030
+ }
2031
+ },
2032
+ "total_flos": 4.2974831149056e+16,
2033
+ "train_batch_size": 23,
2034
+ "trial_name": null,
2035
+ "trial_params": null
2036
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5219e41e4dceacec8a12f41d37a63963f260daaa24f2ceec3aa4813f0f19a9b
3
+ size 5368
vocab.json ADDED
The diff for this file is too large to render. See raw diff