kcm133 commited on
Commit
505e911
·
verified ·
1 Parent(s): 96cd380

model : 0523013301

Browse files
config.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilbert/distilbert-base-uncased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "Jersey Basic",
13
+ "1": "Under-, Nightwear",
14
+ "2": "Socks and Tights",
15
+ "3": "Jersey Fancy",
16
+ "4": "Accessories",
17
+ "5": "Trousers Denim",
18
+ "6": "Outdoor",
19
+ "7": "Shoes",
20
+ "8": "Swimwear",
21
+ "9": "Knitwear",
22
+ "10": "Shirts",
23
+ "11": "Trousers",
24
+ "12": "Dressed",
25
+ "13": "Shorts",
26
+ "14": "Dresses Ladies",
27
+ "15": "Skirts",
28
+ "16": "Special Offers",
29
+ "17": "Blouses",
30
+ "18": "Woven/Jersey/Knitted mix Baby",
31
+ "19": "Dresses/Skirts girls"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "label2id": {
35
+ "Accessories": 4,
36
+ "Blouses": 17,
37
+ "Dressed": 12,
38
+ "Dresses Ladies": 14,
39
+ "Dresses/Skirts girls": 19,
40
+ "Jersey Basic": 0,
41
+ "Jersey Fancy": 3,
42
+ "Knitwear": 9,
43
+ "Outdoor": 6,
44
+ "Shirts": 10,
45
+ "Shoes": 7,
46
+ "Shorts": 13,
47
+ "Skirts": 15,
48
+ "Socks and Tights": 2,
49
+ "Special Offers": 16,
50
+ "Swimwear": 8,
51
+ "Trousers": 11,
52
+ "Trousers Denim": 5,
53
+ "Under-, Nightwear": 1,
54
+ "Woven/Jersey/Knitted mix Baby": 18
55
+ },
56
+ "max_position_embeddings": 512,
57
+ "model_type": "distilbert",
58
+ "n_heads": 12,
59
+ "n_layers": 6,
60
+ "pad_token_id": 0,
61
+ "problem_type": "single_label_classification",
62
+ "qa_dropout": 0.1,
63
+ "seq_classif_dropout": 0.2,
64
+ "sinusoidal_pos_embds": false,
65
+ "tie_weights_": true,
66
+ "torch_dtype": "float32",
67
+ "transformers_version": "4.41.0",
68
+ "vocab_size": 30522
69
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:291f893940f0965009475392375fa0456885a46f72341e28a479d4c3ae1e90fa
3
+ size 267887936
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:675207e42cb9771234a18cd05e4157e1010d362a8c9da9b6abb3d19bb9a54f8e
3
+ size 535838010
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76d2b21de1f75cf4cfeaae79e1d9149037272d522e6a1fd7c551b4028a9f7e25
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7af99a8771cbf6f19faee7183ef06b0cb12033a679e66ad2f10dc3b2030aef2d
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 512,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "strip_accents": null,
52
+ "tokenize_chinese_chars": true,
53
+ "tokenizer_class": "DistilBertTokenizer",
54
+ "unk_token": "[UNK]"
55
+ }
trainer_state.json ADDED
@@ -0,0 +1,1686 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.31098222732543945,
3
+ "best_model_checkpoint": "./seq_clf/0523013301/checkpoint-24528",
4
+ "epoch": 25.0,
5
+ "eval_steps": 500,
6
+ "global_step": 102200,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.1223091976516634,
13
+ "grad_norm": 3.0948333740234375,
14
+ "learning_rate": 1.990215264187867e-05,
15
+ "loss": 1.4987,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.2446183953033268,
20
+ "grad_norm": 4.519032955169678,
21
+ "learning_rate": 1.9804305283757338e-05,
22
+ "loss": 0.8512,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.3669275929549902,
27
+ "grad_norm": 6.118898868560791,
28
+ "learning_rate": 1.970645792563601e-05,
29
+ "loss": 0.7319,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.4892367906066536,
34
+ "grad_norm": 5.870689392089844,
35
+ "learning_rate": 1.9608610567514678e-05,
36
+ "loss": 0.673,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.6115459882583171,
41
+ "grad_norm": 5.046342849731445,
42
+ "learning_rate": 1.951076320939335e-05,
43
+ "loss": 0.6408,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.7338551859099804,
48
+ "grad_norm": 5.137794494628906,
49
+ "learning_rate": 1.9412915851272015e-05,
50
+ "loss": 0.6029,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.8561643835616438,
55
+ "grad_norm": 5.616779327392578,
56
+ "learning_rate": 1.9315068493150686e-05,
57
+ "loss": 0.5867,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.9784735812133072,
62
+ "grad_norm": 3.4356954097747803,
63
+ "learning_rate": 1.9217221135029354e-05,
64
+ "loss": 0.5654,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 1.0,
69
+ "eval_accuracy": 0.8249802208386364,
70
+ "eval_loss": 0.5318922400474548,
71
+ "eval_runtime": 77.9312,
72
+ "eval_samples_per_second": 373.034,
73
+ "eval_steps_per_second": 11.664,
74
+ "step": 4088
75
+ },
76
+ {
77
+ "epoch": 1.1007827788649707,
78
+ "grad_norm": 5.042200088500977,
79
+ "learning_rate": 1.9119373776908026e-05,
80
+ "loss": 0.5171,
81
+ "step": 4500
82
+ },
83
+ {
84
+ "epoch": 1.223091976516634,
85
+ "grad_norm": 5.630020618438721,
86
+ "learning_rate": 1.9021526418786694e-05,
87
+ "loss": 0.4983,
88
+ "step": 5000
89
+ },
90
+ {
91
+ "epoch": 1.3454011741682974,
92
+ "grad_norm": 3.5620667934417725,
93
+ "learning_rate": 1.8923679060665363e-05,
94
+ "loss": 0.488,
95
+ "step": 5500
96
+ },
97
+ {
98
+ "epoch": 1.467710371819961,
99
+ "grad_norm": 6.017724990844727,
100
+ "learning_rate": 1.8825831702544034e-05,
101
+ "loss": 0.4867,
102
+ "step": 6000
103
+ },
104
+ {
105
+ "epoch": 1.5900195694716244,
106
+ "grad_norm": 4.974002361297607,
107
+ "learning_rate": 1.8727984344422703e-05,
108
+ "loss": 0.4645,
109
+ "step": 6500
110
+ },
111
+ {
112
+ "epoch": 1.7123287671232876,
113
+ "grad_norm": 5.986278057098389,
114
+ "learning_rate": 1.863013698630137e-05,
115
+ "loss": 0.4582,
116
+ "step": 7000
117
+ },
118
+ {
119
+ "epoch": 1.8346379647749511,
120
+ "grad_norm": 5.255044937133789,
121
+ "learning_rate": 1.853228962818004e-05,
122
+ "loss": 0.4609,
123
+ "step": 7500
124
+ },
125
+ {
126
+ "epoch": 1.9569471624266144,
127
+ "grad_norm": 5.023875713348389,
128
+ "learning_rate": 1.843444227005871e-05,
129
+ "loss": 0.4495,
130
+ "step": 8000
131
+ },
132
+ {
133
+ "epoch": 2.0,
134
+ "eval_accuracy": 0.8506759313405111,
135
+ "eval_loss": 0.4426937699317932,
136
+ "eval_runtime": 78.0987,
137
+ "eval_samples_per_second": 372.234,
138
+ "eval_steps_per_second": 11.639,
139
+ "step": 8176
140
+ },
141
+ {
142
+ "epoch": 2.079256360078278,
143
+ "grad_norm": 6.947430610656738,
144
+ "learning_rate": 1.833659491193738e-05,
145
+ "loss": 0.4058,
146
+ "step": 8500
147
+ },
148
+ {
149
+ "epoch": 2.2015655577299413,
150
+ "grad_norm": 2.9549953937530518,
151
+ "learning_rate": 1.8238747553816047e-05,
152
+ "loss": 0.3764,
153
+ "step": 9000
154
+ },
155
+ {
156
+ "epoch": 2.323874755381605,
157
+ "grad_norm": 6.063886642456055,
158
+ "learning_rate": 1.814090019569472e-05,
159
+ "loss": 0.3785,
160
+ "step": 9500
161
+ },
162
+ {
163
+ "epoch": 2.446183953033268,
164
+ "grad_norm": 6.911349296569824,
165
+ "learning_rate": 1.8043052837573387e-05,
166
+ "loss": 0.3736,
167
+ "step": 10000
168
+ },
169
+ {
170
+ "epoch": 2.5684931506849313,
171
+ "grad_norm": 5.118221282958984,
172
+ "learning_rate": 1.7945205479452055e-05,
173
+ "loss": 0.3684,
174
+ "step": 10500
175
+ },
176
+ {
177
+ "epoch": 2.690802348336595,
178
+ "grad_norm": 4.369062423706055,
179
+ "learning_rate": 1.7847358121330724e-05,
180
+ "loss": 0.3607,
181
+ "step": 11000
182
+ },
183
+ {
184
+ "epoch": 2.8131115459882583,
185
+ "grad_norm": 6.040197849273682,
186
+ "learning_rate": 1.7749510763209395e-05,
187
+ "loss": 0.3657,
188
+ "step": 11500
189
+ },
190
+ {
191
+ "epoch": 2.935420743639922,
192
+ "grad_norm": 6.751057147979736,
193
+ "learning_rate": 1.7651663405088064e-05,
194
+ "loss": 0.3517,
195
+ "step": 12000
196
+ },
197
+ {
198
+ "epoch": 3.0,
199
+ "eval_accuracy": 0.8710398679096006,
200
+ "eval_loss": 0.3752569854259491,
201
+ "eval_runtime": 78.0858,
202
+ "eval_samples_per_second": 372.295,
203
+ "eval_steps_per_second": 11.641,
204
+ "step": 12264
205
+ },
206
+ {
207
+ "epoch": 3.0577299412915853,
208
+ "grad_norm": 5.28397798538208,
209
+ "learning_rate": 1.7553816046966735e-05,
210
+ "loss": 0.3172,
211
+ "step": 12500
212
+ },
213
+ {
214
+ "epoch": 3.1800391389432487,
215
+ "grad_norm": 3.873892068862915,
216
+ "learning_rate": 1.74559686888454e-05,
217
+ "loss": 0.2865,
218
+ "step": 13000
219
+ },
220
+ {
221
+ "epoch": 3.302348336594912,
222
+ "grad_norm": 3.584193468093872,
223
+ "learning_rate": 1.735812133072407e-05,
224
+ "loss": 0.285,
225
+ "step": 13500
226
+ },
227
+ {
228
+ "epoch": 3.4246575342465753,
229
+ "grad_norm": 6.080500602722168,
230
+ "learning_rate": 1.726027397260274e-05,
231
+ "loss": 0.2867,
232
+ "step": 14000
233
+ },
234
+ {
235
+ "epoch": 3.5469667318982387,
236
+ "grad_norm": 5.05246114730835,
237
+ "learning_rate": 1.716242661448141e-05,
238
+ "loss": 0.2793,
239
+ "step": 14500
240
+ },
241
+ {
242
+ "epoch": 3.6692759295499022,
243
+ "grad_norm": 4.842950344085693,
244
+ "learning_rate": 1.706457925636008e-05,
245
+ "loss": 0.2829,
246
+ "step": 15000
247
+ },
248
+ {
249
+ "epoch": 3.7915851272015657,
250
+ "grad_norm": 6.52248477935791,
251
+ "learning_rate": 1.6966731898238748e-05,
252
+ "loss": 0.2776,
253
+ "step": 15500
254
+ },
255
+ {
256
+ "epoch": 3.9138943248532287,
257
+ "grad_norm": 5.814143657684326,
258
+ "learning_rate": 1.686888454011742e-05,
259
+ "loss": 0.268,
260
+ "step": 16000
261
+ },
262
+ {
263
+ "epoch": 4.0,
264
+ "eval_accuracy": 0.8836985311822779,
265
+ "eval_loss": 0.33874690532684326,
266
+ "eval_runtime": 78.7296,
267
+ "eval_samples_per_second": 369.251,
268
+ "eval_steps_per_second": 11.546,
269
+ "step": 16352
270
+ },
271
+ {
272
+ "epoch": 4.036203522504892,
273
+ "grad_norm": 4.4718194007873535,
274
+ "learning_rate": 1.6771037181996088e-05,
275
+ "loss": 0.2524,
276
+ "step": 16500
277
+ },
278
+ {
279
+ "epoch": 4.158512720156556,
280
+ "grad_norm": 6.8225531578063965,
281
+ "learning_rate": 1.6673189823874756e-05,
282
+ "loss": 0.2121,
283
+ "step": 17000
284
+ },
285
+ {
286
+ "epoch": 4.280821917808219,
287
+ "grad_norm": 4.292703151702881,
288
+ "learning_rate": 1.6575342465753425e-05,
289
+ "loss": 0.2168,
290
+ "step": 17500
291
+ },
292
+ {
293
+ "epoch": 4.403131115459883,
294
+ "grad_norm": 6.740828514099121,
295
+ "learning_rate": 1.6477495107632096e-05,
296
+ "loss": 0.2079,
297
+ "step": 18000
298
+ },
299
+ {
300
+ "epoch": 4.525440313111546,
301
+ "grad_norm": 7.279580593109131,
302
+ "learning_rate": 1.6379647749510764e-05,
303
+ "loss": 0.2178,
304
+ "step": 18500
305
+ },
306
+ {
307
+ "epoch": 4.64774951076321,
308
+ "grad_norm": 8.839160919189453,
309
+ "learning_rate": 1.6281800391389433e-05,
310
+ "loss": 0.2135,
311
+ "step": 19000
312
+ },
313
+ {
314
+ "epoch": 4.770058708414873,
315
+ "grad_norm": 5.748088836669922,
316
+ "learning_rate": 1.6183953033268104e-05,
317
+ "loss": 0.215,
318
+ "step": 19500
319
+ },
320
+ {
321
+ "epoch": 4.892367906066536,
322
+ "grad_norm": 6.222439765930176,
323
+ "learning_rate": 1.6086105675146773e-05,
324
+ "loss": 0.2114,
325
+ "step": 20000
326
+ },
327
+ {
328
+ "epoch": 5.0,
329
+ "eval_accuracy": 0.895531629458911,
330
+ "eval_loss": 0.3136844038963318,
331
+ "eval_runtime": 78.002,
332
+ "eval_samples_per_second": 372.695,
333
+ "eval_steps_per_second": 11.654,
334
+ "step": 20440
335
+ },
336
+ {
337
+ "epoch": 5.014677103718199,
338
+ "grad_norm": 5.067689895629883,
339
+ "learning_rate": 1.598825831702544e-05,
340
+ "loss": 0.2001,
341
+ "step": 20500
342
+ },
343
+ {
344
+ "epoch": 5.136986301369863,
345
+ "grad_norm": 5.334500789642334,
346
+ "learning_rate": 1.589041095890411e-05,
347
+ "loss": 0.1574,
348
+ "step": 21000
349
+ },
350
+ {
351
+ "epoch": 5.259295499021526,
352
+ "grad_norm": 8.226490020751953,
353
+ "learning_rate": 1.579256360078278e-05,
354
+ "loss": 0.1599,
355
+ "step": 21500
356
+ },
357
+ {
358
+ "epoch": 5.38160469667319,
359
+ "grad_norm": 5.279560565948486,
360
+ "learning_rate": 1.569471624266145e-05,
361
+ "loss": 0.1594,
362
+ "step": 22000
363
+ },
364
+ {
365
+ "epoch": 5.503913894324853,
366
+ "grad_norm": 8.88842487335205,
367
+ "learning_rate": 1.559686888454012e-05,
368
+ "loss": 0.1577,
369
+ "step": 22500
370
+ },
371
+ {
372
+ "epoch": 5.626223091976517,
373
+ "grad_norm": 6.64274787902832,
374
+ "learning_rate": 1.5499021526418785e-05,
375
+ "loss": 0.1566,
376
+ "step": 23000
377
+ },
378
+ {
379
+ "epoch": 5.74853228962818,
380
+ "grad_norm": 8.864998817443848,
381
+ "learning_rate": 1.5401174168297457e-05,
382
+ "loss": 0.163,
383
+ "step": 23500
384
+ },
385
+ {
386
+ "epoch": 5.870841487279844,
387
+ "grad_norm": 4.494806289672852,
388
+ "learning_rate": 1.5303326810176125e-05,
389
+ "loss": 0.159,
390
+ "step": 24000
391
+ },
392
+ {
393
+ "epoch": 5.993150684931507,
394
+ "grad_norm": 8.841043472290039,
395
+ "learning_rate": 1.5205479452054797e-05,
396
+ "loss": 0.1593,
397
+ "step": 24500
398
+ },
399
+ {
400
+ "epoch": 6.0,
401
+ "eval_accuracy": 0.9032025042138213,
402
+ "eval_loss": 0.31098222732543945,
403
+ "eval_runtime": 78.1256,
404
+ "eval_samples_per_second": 372.106,
405
+ "eval_steps_per_second": 11.635,
406
+ "step": 24528
407
+ },
408
+ {
409
+ "epoch": 6.1154598825831705,
410
+ "grad_norm": 5.516913890838623,
411
+ "learning_rate": 1.5107632093933464e-05,
412
+ "loss": 0.1186,
413
+ "step": 25000
414
+ },
415
+ {
416
+ "epoch": 6.237769080234834,
417
+ "grad_norm": 17.19109344482422,
418
+ "learning_rate": 1.5009784735812134e-05,
419
+ "loss": 0.1242,
420
+ "step": 25500
421
+ },
422
+ {
423
+ "epoch": 6.3600782778864975,
424
+ "grad_norm": 4.817780494689941,
425
+ "learning_rate": 1.4911937377690804e-05,
426
+ "loss": 0.1264,
427
+ "step": 26000
428
+ },
429
+ {
430
+ "epoch": 6.48238747553816,
431
+ "grad_norm": 9.93541431427002,
432
+ "learning_rate": 1.4814090019569473e-05,
433
+ "loss": 0.1251,
434
+ "step": 26500
435
+ },
436
+ {
437
+ "epoch": 6.604696673189824,
438
+ "grad_norm": 7.402759552001953,
439
+ "learning_rate": 1.4716242661448142e-05,
440
+ "loss": 0.1205,
441
+ "step": 27000
442
+ },
443
+ {
444
+ "epoch": 6.727005870841487,
445
+ "grad_norm": 4.997498989105225,
446
+ "learning_rate": 1.4618395303326812e-05,
447
+ "loss": 0.1221,
448
+ "step": 27500
449
+ },
450
+ {
451
+ "epoch": 6.8493150684931505,
452
+ "grad_norm": 8.2695951461792,
453
+ "learning_rate": 1.4520547945205482e-05,
454
+ "loss": 0.126,
455
+ "step": 28000
456
+ },
457
+ {
458
+ "epoch": 6.971624266144814,
459
+ "grad_norm": 7.226652145385742,
460
+ "learning_rate": 1.4422700587084152e-05,
461
+ "loss": 0.1238,
462
+ "step": 28500
463
+ },
464
+ {
465
+ "epoch": 7.0,
466
+ "eval_accuracy": 0.9069863437790238,
467
+ "eval_loss": 0.3151151239871979,
468
+ "eval_runtime": 77.3038,
469
+ "eval_samples_per_second": 376.062,
470
+ "eval_steps_per_second": 11.759,
471
+ "step": 28616
472
+ },
473
+ {
474
+ "epoch": 7.0939334637964775,
475
+ "grad_norm": 6.771386623382568,
476
+ "learning_rate": 1.4324853228962818e-05,
477
+ "loss": 0.1014,
478
+ "step": 29000
479
+ },
480
+ {
481
+ "epoch": 7.216242661448141,
482
+ "grad_norm": 7.908430099487305,
483
+ "learning_rate": 1.4227005870841488e-05,
484
+ "loss": 0.0917,
485
+ "step": 29500
486
+ },
487
+ {
488
+ "epoch": 7.3385518590998045,
489
+ "grad_norm": 11.553496360778809,
490
+ "learning_rate": 1.4129158512720158e-05,
491
+ "loss": 0.0989,
492
+ "step": 30000
493
+ },
494
+ {
495
+ "epoch": 7.460861056751468,
496
+ "grad_norm": 5.48296594619751,
497
+ "learning_rate": 1.4031311154598828e-05,
498
+ "loss": 0.0945,
499
+ "step": 30500
500
+ },
501
+ {
502
+ "epoch": 7.583170254403131,
503
+ "grad_norm": 4.382979869842529,
504
+ "learning_rate": 1.3933463796477496e-05,
505
+ "loss": 0.0904,
506
+ "step": 31000
507
+ },
508
+ {
509
+ "epoch": 7.705479452054795,
510
+ "grad_norm": 4.479061126708984,
511
+ "learning_rate": 1.3835616438356164e-05,
512
+ "loss": 0.1003,
513
+ "step": 31500
514
+ },
515
+ {
516
+ "epoch": 7.8277886497064575,
517
+ "grad_norm": 2.2252697944641113,
518
+ "learning_rate": 1.3737769080234834e-05,
519
+ "loss": 0.0963,
520
+ "step": 32000
521
+ },
522
+ {
523
+ "epoch": 7.950097847358121,
524
+ "grad_norm": 5.149953842163086,
525
+ "learning_rate": 1.3639921722113504e-05,
526
+ "loss": 0.0969,
527
+ "step": 32500
528
+ },
529
+ {
530
+ "epoch": 8.0,
531
+ "eval_accuracy": 0.9127652987513329,
532
+ "eval_loss": 0.3146636486053467,
533
+ "eval_runtime": 77.5909,
534
+ "eval_samples_per_second": 374.67,
535
+ "eval_steps_per_second": 11.715,
536
+ "step": 32704
537
+ },
538
+ {
539
+ "epoch": 8.072407045009784,
540
+ "grad_norm": 10.789752960205078,
541
+ "learning_rate": 1.3542074363992173e-05,
542
+ "loss": 0.0788,
543
+ "step": 33000
544
+ },
545
+ {
546
+ "epoch": 8.194716242661448,
547
+ "grad_norm": 5.515634536743164,
548
+ "learning_rate": 1.3444227005870843e-05,
549
+ "loss": 0.0717,
550
+ "step": 33500
551
+ },
552
+ {
553
+ "epoch": 8.317025440313111,
554
+ "grad_norm": 10.454425811767578,
555
+ "learning_rate": 1.3346379647749513e-05,
556
+ "loss": 0.0719,
557
+ "step": 34000
558
+ },
559
+ {
560
+ "epoch": 8.439334637964775,
561
+ "grad_norm": 7.216288089752197,
562
+ "learning_rate": 1.3248532289628183e-05,
563
+ "loss": 0.0764,
564
+ "step": 34500
565
+ },
566
+ {
567
+ "epoch": 8.561643835616438,
568
+ "grad_norm": 6.058413028717041,
569
+ "learning_rate": 1.3150684931506849e-05,
570
+ "loss": 0.0775,
571
+ "step": 35000
572
+ },
573
+ {
574
+ "epoch": 8.683953033268102,
575
+ "grad_norm": 1.9321287870407104,
576
+ "learning_rate": 1.3052837573385519e-05,
577
+ "loss": 0.0762,
578
+ "step": 35500
579
+ },
580
+ {
581
+ "epoch": 8.806262230919765,
582
+ "grad_norm": 5.581308841705322,
583
+ "learning_rate": 1.2954990215264189e-05,
584
+ "loss": 0.0763,
585
+ "step": 36000
586
+ },
587
+ {
588
+ "epoch": 8.928571428571429,
589
+ "grad_norm": 8.296586036682129,
590
+ "learning_rate": 1.2857142857142859e-05,
591
+ "loss": 0.0826,
592
+ "step": 36500
593
+ },
594
+ {
595
+ "epoch": 9.0,
596
+ "eval_accuracy": 0.9133156754153624,
597
+ "eval_loss": 0.33166706562042236,
598
+ "eval_runtime": 77.4584,
599
+ "eval_samples_per_second": 375.311,
600
+ "eval_steps_per_second": 11.735,
601
+ "step": 36792
602
+ },
603
+ {
604
+ "epoch": 9.050880626223092,
605
+ "grad_norm": 6.40161657333374,
606
+ "learning_rate": 1.2759295499021527e-05,
607
+ "loss": 0.0689,
608
+ "step": 37000
609
+ },
610
+ {
611
+ "epoch": 9.173189823874756,
612
+ "grad_norm": 9.357067108154297,
613
+ "learning_rate": 1.2661448140900197e-05,
614
+ "loss": 0.0581,
615
+ "step": 37500
616
+ },
617
+ {
618
+ "epoch": 9.29549902152642,
619
+ "grad_norm": 4.852538108825684,
620
+ "learning_rate": 1.2563600782778867e-05,
621
+ "loss": 0.0582,
622
+ "step": 38000
623
+ },
624
+ {
625
+ "epoch": 9.417808219178083,
626
+ "grad_norm": 6.204359531402588,
627
+ "learning_rate": 1.2465753424657537e-05,
628
+ "loss": 0.06,
629
+ "step": 38500
630
+ },
631
+ {
632
+ "epoch": 9.540117416829746,
633
+ "grad_norm": 11.658947944641113,
634
+ "learning_rate": 1.2367906066536204e-05,
635
+ "loss": 0.0626,
636
+ "step": 39000
637
+ },
638
+ {
639
+ "epoch": 9.66242661448141,
640
+ "grad_norm": 12.047215461730957,
641
+ "learning_rate": 1.2270058708414874e-05,
642
+ "loss": 0.0626,
643
+ "step": 39500
644
+ },
645
+ {
646
+ "epoch": 9.784735812133073,
647
+ "grad_norm": 11.078405380249023,
648
+ "learning_rate": 1.2172211350293543e-05,
649
+ "loss": 0.0655,
650
+ "step": 40000
651
+ },
652
+ {
653
+ "epoch": 9.907045009784735,
654
+ "grad_norm": 3.5147736072540283,
655
+ "learning_rate": 1.2074363992172213e-05,
656
+ "loss": 0.0663,
657
+ "step": 40500
658
+ },
659
+ {
660
+ "epoch": 10.0,
661
+ "eval_accuracy": 0.9173747033125795,
662
+ "eval_loss": 0.34417206048965454,
663
+ "eval_runtime": 77.3814,
664
+ "eval_samples_per_second": 375.685,
665
+ "eval_steps_per_second": 11.747,
666
+ "step": 40880
667
+ },
668
+ {
669
+ "epoch": 10.029354207436398,
670
+ "grad_norm": 2.839688539505005,
671
+ "learning_rate": 1.1976516634050882e-05,
672
+ "loss": 0.0609,
673
+ "step": 41000
674
+ },
675
+ {
676
+ "epoch": 10.151663405088062,
677
+ "grad_norm": 10.794103622436523,
678
+ "learning_rate": 1.187866927592955e-05,
679
+ "loss": 0.0467,
680
+ "step": 41500
681
+ },
682
+ {
683
+ "epoch": 10.273972602739725,
684
+ "grad_norm": 3.0600426197052,
685
+ "learning_rate": 1.178082191780822e-05,
686
+ "loss": 0.0464,
687
+ "step": 42000
688
+ },
689
+ {
690
+ "epoch": 10.396281800391389,
691
+ "grad_norm": 7.135795593261719,
692
+ "learning_rate": 1.168297455968689e-05,
693
+ "loss": 0.051,
694
+ "step": 42500
695
+ },
696
+ {
697
+ "epoch": 10.518590998043052,
698
+ "grad_norm": 5.585446834564209,
699
+ "learning_rate": 1.1585127201565558e-05,
700
+ "loss": 0.0531,
701
+ "step": 43000
702
+ },
703
+ {
704
+ "epoch": 10.640900195694716,
705
+ "grad_norm": 0.05372029170393944,
706
+ "learning_rate": 1.1487279843444228e-05,
707
+ "loss": 0.0504,
708
+ "step": 43500
709
+ },
710
+ {
711
+ "epoch": 10.76320939334638,
712
+ "grad_norm": 12.072264671325684,
713
+ "learning_rate": 1.1389432485322898e-05,
714
+ "loss": 0.0511,
715
+ "step": 44000
716
+ },
717
+ {
718
+ "epoch": 10.885518590998043,
719
+ "grad_norm": 5.787899971008301,
720
+ "learning_rate": 1.1291585127201568e-05,
721
+ "loss": 0.0519,
722
+ "step": 44500
723
+ },
724
+ {
725
+ "epoch": 11.0,
726
+ "eval_accuracy": 0.9171339135220666,
727
+ "eval_loss": 0.3852131962776184,
728
+ "eval_runtime": 77.3953,
729
+ "eval_samples_per_second": 375.617,
730
+ "eval_steps_per_second": 11.745,
731
+ "step": 44968
732
+ },
733
+ {
734
+ "epoch": 11.007827788649706,
735
+ "grad_norm": 8.351863861083984,
736
+ "learning_rate": 1.1193737769080235e-05,
737
+ "loss": 0.0508,
738
+ "step": 45000
739
+ },
740
+ {
741
+ "epoch": 11.13013698630137,
742
+ "grad_norm": 6.567861080169678,
743
+ "learning_rate": 1.1095890410958904e-05,
744
+ "loss": 0.0403,
745
+ "step": 45500
746
+ },
747
+ {
748
+ "epoch": 11.252446183953033,
749
+ "grad_norm": 5.368165016174316,
750
+ "learning_rate": 1.0998043052837574e-05,
751
+ "loss": 0.0419,
752
+ "step": 46000
753
+ },
754
+ {
755
+ "epoch": 11.374755381604697,
756
+ "grad_norm": 11.540104866027832,
757
+ "learning_rate": 1.0900195694716244e-05,
758
+ "loss": 0.0466,
759
+ "step": 46500
760
+ },
761
+ {
762
+ "epoch": 11.49706457925636,
763
+ "grad_norm": 2.5703656673431396,
764
+ "learning_rate": 1.0802348336594913e-05,
765
+ "loss": 0.0422,
766
+ "step": 47000
767
+ },
768
+ {
769
+ "epoch": 11.619373776908024,
770
+ "grad_norm": 10.161659240722656,
771
+ "learning_rate": 1.0704500978473583e-05,
772
+ "loss": 0.0492,
773
+ "step": 47500
774
+ },
775
+ {
776
+ "epoch": 11.741682974559687,
777
+ "grad_norm": 9.083320617675781,
778
+ "learning_rate": 1.0606653620352253e-05,
779
+ "loss": 0.0444,
780
+ "step": 48000
781
+ },
782
+ {
783
+ "epoch": 11.86399217221135,
784
+ "grad_norm": 7.005116939544678,
785
+ "learning_rate": 1.0508806262230922e-05,
786
+ "loss": 0.0431,
787
+ "step": 48500
788
+ },
789
+ {
790
+ "epoch": 11.986301369863014,
791
+ "grad_norm": 5.291450023651123,
792
+ "learning_rate": 1.0410958904109589e-05,
793
+ "loss": 0.045,
794
+ "step": 49000
795
+ },
796
+ {
797
+ "epoch": 12.0,
798
+ "eval_accuracy": 0.9189226376801624,
799
+ "eval_loss": 0.3933347463607788,
800
+ "eval_runtime": 77.3468,
801
+ "eval_samples_per_second": 375.852,
802
+ "eval_steps_per_second": 11.752,
803
+ "step": 49056
804
+ },
805
+ {
806
+ "epoch": 12.108610567514678,
807
+ "grad_norm": 3.0380940437316895,
808
+ "learning_rate": 1.0313111545988259e-05,
809
+ "loss": 0.0338,
810
+ "step": 49500
811
+ },
812
+ {
813
+ "epoch": 12.230919765166341,
814
+ "grad_norm": 0.6975650787353516,
815
+ "learning_rate": 1.0215264187866929e-05,
816
+ "loss": 0.0343,
817
+ "step": 50000
818
+ },
819
+ {
820
+ "epoch": 12.353228962818005,
821
+ "grad_norm": 0.18672548234462738,
822
+ "learning_rate": 1.0117416829745599e-05,
823
+ "loss": 0.0349,
824
+ "step": 50500
825
+ },
826
+ {
827
+ "epoch": 12.475538160469668,
828
+ "grad_norm": 6.868373394012451,
829
+ "learning_rate": 1.0019569471624267e-05,
830
+ "loss": 0.0343,
831
+ "step": 51000
832
+ },
833
+ {
834
+ "epoch": 12.597847358121331,
835
+ "grad_norm": 1.4025439023971558,
836
+ "learning_rate": 9.921722113502935e-06,
837
+ "loss": 0.0396,
838
+ "step": 51500
839
+ },
840
+ {
841
+ "epoch": 12.720156555772995,
842
+ "grad_norm": 4.942438125610352,
843
+ "learning_rate": 9.823874755381605e-06,
844
+ "loss": 0.0381,
845
+ "step": 52000
846
+ },
847
+ {
848
+ "epoch": 12.842465753424658,
849
+ "grad_norm": 4.044002056121826,
850
+ "learning_rate": 9.726027397260275e-06,
851
+ "loss": 0.0382,
852
+ "step": 52500
853
+ },
854
+ {
855
+ "epoch": 12.96477495107632,
856
+ "grad_norm": 0.6736146211624146,
857
+ "learning_rate": 9.628180039138944e-06,
858
+ "loss": 0.0368,
859
+ "step": 53000
860
+ },
861
+ {
862
+ "epoch": 13.0,
863
+ "eval_accuracy": 0.9228096728698704,
864
+ "eval_loss": 0.38157734274864197,
865
+ "eval_runtime": 77.3834,
866
+ "eval_samples_per_second": 375.675,
867
+ "eval_steps_per_second": 11.747,
868
+ "step": 53144
869
+ },
870
+ {
871
+ "epoch": 13.087084148727984,
872
+ "grad_norm": 1.1303954124450684,
873
+ "learning_rate": 9.530332681017614e-06,
874
+ "loss": 0.0306,
875
+ "step": 53500
876
+ },
877
+ {
878
+ "epoch": 13.209393346379647,
879
+ "grad_norm": 1.899757742881775,
880
+ "learning_rate": 9.432485322896282e-06,
881
+ "loss": 0.0302,
882
+ "step": 54000
883
+ },
884
+ {
885
+ "epoch": 13.33170254403131,
886
+ "grad_norm": 10.160910606384277,
887
+ "learning_rate": 9.334637964774952e-06,
888
+ "loss": 0.0284,
889
+ "step": 54500
890
+ },
891
+ {
892
+ "epoch": 13.454011741682974,
893
+ "grad_norm": 6.715288162231445,
894
+ "learning_rate": 9.23679060665362e-06,
895
+ "loss": 0.0351,
896
+ "step": 55000
897
+ },
898
+ {
899
+ "epoch": 13.576320939334638,
900
+ "grad_norm": 9.383013725280762,
901
+ "learning_rate": 9.13894324853229e-06,
902
+ "loss": 0.0342,
903
+ "step": 55500
904
+ },
905
+ {
906
+ "epoch": 13.698630136986301,
907
+ "grad_norm": 0.5024349093437195,
908
+ "learning_rate": 9.04109589041096e-06,
909
+ "loss": 0.0358,
910
+ "step": 56000
911
+ },
912
+ {
913
+ "epoch": 13.820939334637965,
914
+ "grad_norm": 6.495013236999512,
915
+ "learning_rate": 8.943248532289628e-06,
916
+ "loss": 0.0361,
917
+ "step": 56500
918
+ },
919
+ {
920
+ "epoch": 13.943248532289628,
921
+ "grad_norm": 3.360276699066162,
922
+ "learning_rate": 8.845401174168298e-06,
923
+ "loss": 0.0332,
924
+ "step": 57000
925
+ },
926
+ {
927
+ "epoch": 14.0,
928
+ "eval_accuracy": 0.9229816655773795,
929
+ "eval_loss": 0.4262143075466156,
930
+ "eval_runtime": 77.1924,
931
+ "eval_samples_per_second": 376.604,
932
+ "eval_steps_per_second": 11.776,
933
+ "step": 57232
934
+ },
935
+ {
936
+ "epoch": 14.065557729941291,
937
+ "grad_norm": 2.5697081089019775,
938
+ "learning_rate": 8.747553816046968e-06,
939
+ "loss": 0.0271,
940
+ "step": 57500
941
+ },
942
+ {
943
+ "epoch": 14.187866927592955,
944
+ "grad_norm": 2.5552070140838623,
945
+ "learning_rate": 8.649706457925636e-06,
946
+ "loss": 0.0229,
947
+ "step": 58000
948
+ },
949
+ {
950
+ "epoch": 14.310176125244618,
951
+ "grad_norm": 11.48392105102539,
952
+ "learning_rate": 8.551859099804306e-06,
953
+ "loss": 0.0275,
954
+ "step": 58500
955
+ },
956
+ {
957
+ "epoch": 14.432485322896282,
958
+ "grad_norm": 4.35728645324707,
959
+ "learning_rate": 8.454011741682975e-06,
960
+ "loss": 0.026,
961
+ "step": 59000
962
+ },
963
+ {
964
+ "epoch": 14.554794520547945,
965
+ "grad_norm": 18.579906463623047,
966
+ "learning_rate": 8.356164383561644e-06,
967
+ "loss": 0.0259,
968
+ "step": 59500
969
+ },
970
+ {
971
+ "epoch": 14.677103718199609,
972
+ "grad_norm": 7.566853046417236,
973
+ "learning_rate": 8.258317025440313e-06,
974
+ "loss": 0.0281,
975
+ "step": 60000
976
+ },
977
+ {
978
+ "epoch": 14.799412915851272,
979
+ "grad_norm": 0.4659290611743927,
980
+ "learning_rate": 8.160469667318983e-06,
981
+ "loss": 0.0283,
982
+ "step": 60500
983
+ },
984
+ {
985
+ "epoch": 14.921722113502936,
986
+ "grad_norm": 5.014148235321045,
987
+ "learning_rate": 8.062622309197653e-06,
988
+ "loss": 0.0289,
989
+ "step": 61000
990
+ },
991
+ {
992
+ "epoch": 15.0,
993
+ "eval_accuracy": 0.9236008393244126,
994
+ "eval_loss": 0.43280109763145447,
995
+ "eval_runtime": 77.9069,
996
+ "eval_samples_per_second": 373.151,
997
+ "eval_steps_per_second": 11.668,
998
+ "step": 61320
999
+ },
1000
+ {
1001
+ "epoch": 15.0440313111546,
1002
+ "grad_norm": 4.811933994293213,
1003
+ "learning_rate": 7.964774951076321e-06,
1004
+ "loss": 0.027,
1005
+ "step": 61500
1006
+ },
1007
+ {
1008
+ "epoch": 15.166340508806263,
1009
+ "grad_norm": 9.811365127563477,
1010
+ "learning_rate": 7.86692759295499e-06,
1011
+ "loss": 0.0214,
1012
+ "step": 62000
1013
+ },
1014
+ {
1015
+ "epoch": 15.288649706457926,
1016
+ "grad_norm": 1.3940095901489258,
1017
+ "learning_rate": 7.76908023483366e-06,
1018
+ "loss": 0.0219,
1019
+ "step": 62500
1020
+ },
1021
+ {
1022
+ "epoch": 15.41095890410959,
1023
+ "grad_norm": 0.06226726993918419,
1024
+ "learning_rate": 7.671232876712329e-06,
1025
+ "loss": 0.0226,
1026
+ "step": 63000
1027
+ },
1028
+ {
1029
+ "epoch": 15.533268101761253,
1030
+ "grad_norm": 8.294939994812012,
1031
+ "learning_rate": 7.573385518590999e-06,
1032
+ "loss": 0.0251,
1033
+ "step": 63500
1034
+ },
1035
+ {
1036
+ "epoch": 15.655577299412915,
1037
+ "grad_norm": 1.8952441215515137,
1038
+ "learning_rate": 7.475538160469667e-06,
1039
+ "loss": 0.0232,
1040
+ "step": 64000
1041
+ },
1042
+ {
1043
+ "epoch": 15.777886497064578,
1044
+ "grad_norm": 5.1830315589904785,
1045
+ "learning_rate": 7.377690802348337e-06,
1046
+ "loss": 0.0254,
1047
+ "step": 64500
1048
+ },
1049
+ {
1050
+ "epoch": 15.900195694716242,
1051
+ "grad_norm": 1.0473072528839111,
1052
+ "learning_rate": 7.279843444227006e-06,
1053
+ "loss": 0.0248,
1054
+ "step": 65000
1055
+ },
1056
+ {
1057
+ "epoch": 16.0,
1058
+ "eval_accuracy": 0.9260431357710434,
1059
+ "eval_loss": 0.4366961419582367,
1060
+ "eval_runtime": 77.3755,
1061
+ "eval_samples_per_second": 375.713,
1062
+ "eval_steps_per_second": 11.748,
1063
+ "step": 65408
1064
+ },
1065
+ {
1066
+ "epoch": 16.022504892367905,
1067
+ "grad_norm": 0.5687731504440308,
1068
+ "learning_rate": 7.181996086105676e-06,
1069
+ "loss": 0.0238,
1070
+ "step": 65500
1071
+ },
1072
+ {
1073
+ "epoch": 16.14481409001957,
1074
+ "grad_norm": 1.2872580289840698,
1075
+ "learning_rate": 7.0841487279843445e-06,
1076
+ "loss": 0.0194,
1077
+ "step": 66000
1078
+ },
1079
+ {
1080
+ "epoch": 16.267123287671232,
1081
+ "grad_norm": 5.059693813323975,
1082
+ "learning_rate": 6.9863013698630145e-06,
1083
+ "loss": 0.0199,
1084
+ "step": 66500
1085
+ },
1086
+ {
1087
+ "epoch": 16.389432485322896,
1088
+ "grad_norm": 6.738853931427002,
1089
+ "learning_rate": 6.8884540117416836e-06,
1090
+ "loss": 0.0205,
1091
+ "step": 67000
1092
+ },
1093
+ {
1094
+ "epoch": 16.51174168297456,
1095
+ "grad_norm": 11.091848373413086,
1096
+ "learning_rate": 6.790606653620353e-06,
1097
+ "loss": 0.0215,
1098
+ "step": 67500
1099
+ },
1100
+ {
1101
+ "epoch": 16.634050880626223,
1102
+ "grad_norm": 0.19167885184288025,
1103
+ "learning_rate": 6.692759295499022e-06,
1104
+ "loss": 0.022,
1105
+ "step": 68000
1106
+ },
1107
+ {
1108
+ "epoch": 16.756360078277886,
1109
+ "grad_norm": 5.966000080108643,
1110
+ "learning_rate": 6.594911937377692e-06,
1111
+ "loss": 0.0203,
1112
+ "step": 68500
1113
+ },
1114
+ {
1115
+ "epoch": 16.87866927592955,
1116
+ "grad_norm": 3.5066215991973877,
1117
+ "learning_rate": 6.49706457925636e-06,
1118
+ "loss": 0.0187,
1119
+ "step": 69000
1120
+ },
1121
+ {
1122
+ "epoch": 17.0,
1123
+ "eval_accuracy": 0.9272470847236077,
1124
+ "eval_loss": 0.44777625799179077,
1125
+ "eval_runtime": 77.1492,
1126
+ "eval_samples_per_second": 376.815,
1127
+ "eval_steps_per_second": 11.782,
1128
+ "step": 69496
1129
+ },
1130
+ {
1131
+ "epoch": 17.000978473581213,
1132
+ "grad_norm": 0.6858524680137634,
1133
+ "learning_rate": 6.39921722113503e-06,
1134
+ "loss": 0.0204,
1135
+ "step": 69500
1136
+ },
1137
+ {
1138
+ "epoch": 17.123287671232877,
1139
+ "grad_norm": 9.703988075256348,
1140
+ "learning_rate": 6.301369863013699e-06,
1141
+ "loss": 0.0161,
1142
+ "step": 70000
1143
+ },
1144
+ {
1145
+ "epoch": 17.24559686888454,
1146
+ "grad_norm": 4.51020622253418,
1147
+ "learning_rate": 6.203522504892369e-06,
1148
+ "loss": 0.016,
1149
+ "step": 70500
1150
+ },
1151
+ {
1152
+ "epoch": 17.367906066536204,
1153
+ "grad_norm": 0.09826533496379852,
1154
+ "learning_rate": 6.105675146771037e-06,
1155
+ "loss": 0.0177,
1156
+ "step": 71000
1157
+ },
1158
+ {
1159
+ "epoch": 17.490215264187867,
1160
+ "grad_norm": 11.620870590209961,
1161
+ "learning_rate": 6.007827788649707e-06,
1162
+ "loss": 0.02,
1163
+ "step": 71500
1164
+ },
1165
+ {
1166
+ "epoch": 17.61252446183953,
1167
+ "grad_norm": 8.661576271057129,
1168
+ "learning_rate": 5.909980430528376e-06,
1169
+ "loss": 0.017,
1170
+ "step": 72000
1171
+ },
1172
+ {
1173
+ "epoch": 17.734833659491194,
1174
+ "grad_norm": 0.7512950897216797,
1175
+ "learning_rate": 5.812133072407045e-06,
1176
+ "loss": 0.0186,
1177
+ "step": 72500
1178
+ },
1179
+ {
1180
+ "epoch": 17.857142857142858,
1181
+ "grad_norm": 0.44471457600593567,
1182
+ "learning_rate": 5.7142857142857145e-06,
1183
+ "loss": 0.0228,
1184
+ "step": 73000
1185
+ },
1186
+ {
1187
+ "epoch": 17.97945205479452,
1188
+ "grad_norm": 1.0926498174667358,
1189
+ "learning_rate": 5.6164383561643845e-06,
1190
+ "loss": 0.0178,
1191
+ "step": 73500
1192
+ },
1193
+ {
1194
+ "epoch": 18.0,
1195
+ "eval_accuracy": 0.9264559182690654,
1196
+ "eval_loss": 0.46360349655151367,
1197
+ "eval_runtime": 77.3263,
1198
+ "eval_samples_per_second": 375.952,
1199
+ "eval_steps_per_second": 11.755,
1200
+ "step": 73584
1201
+ },
1202
+ {
1203
+ "epoch": 18.101761252446185,
1204
+ "grad_norm": 8.134523391723633,
1205
+ "learning_rate": 5.518590998043053e-06,
1206
+ "loss": 0.0153,
1207
+ "step": 74000
1208
+ },
1209
+ {
1210
+ "epoch": 18.224070450097848,
1211
+ "grad_norm": 0.08053633570671082,
1212
+ "learning_rate": 5.420743639921723e-06,
1213
+ "loss": 0.016,
1214
+ "step": 74500
1215
+ },
1216
+ {
1217
+ "epoch": 18.34637964774951,
1218
+ "grad_norm": 0.44435006380081177,
1219
+ "learning_rate": 5.322896281800392e-06,
1220
+ "loss": 0.0146,
1221
+ "step": 75000
1222
+ },
1223
+ {
1224
+ "epoch": 18.468688845401175,
1225
+ "grad_norm": 2.0229480266571045,
1226
+ "learning_rate": 5.225048923679062e-06,
1227
+ "loss": 0.017,
1228
+ "step": 75500
1229
+ },
1230
+ {
1231
+ "epoch": 18.59099804305284,
1232
+ "grad_norm": 4.502400875091553,
1233
+ "learning_rate": 5.12720156555773e-06,
1234
+ "loss": 0.017,
1235
+ "step": 76000
1236
+ },
1237
+ {
1238
+ "epoch": 18.713307240704502,
1239
+ "grad_norm": 0.14978571236133575,
1240
+ "learning_rate": 5.0293542074364e-06,
1241
+ "loss": 0.0154,
1242
+ "step": 76500
1243
+ },
1244
+ {
1245
+ "epoch": 18.835616438356166,
1246
+ "grad_norm": 9.524068832397461,
1247
+ "learning_rate": 4.931506849315069e-06,
1248
+ "loss": 0.0165,
1249
+ "step": 77000
1250
+ },
1251
+ {
1252
+ "epoch": 18.95792563600783,
1253
+ "grad_norm": 0.9707121253013611,
1254
+ "learning_rate": 4.833659491193738e-06,
1255
+ "loss": 0.0154,
1256
+ "step": 77500
1257
+ },
1258
+ {
1259
+ "epoch": 19.0,
1260
+ "eval_accuracy": 0.9266623095180765,
1261
+ "eval_loss": 0.47839030623435974,
1262
+ "eval_runtime": 77.6682,
1263
+ "eval_samples_per_second": 374.297,
1264
+ "eval_steps_per_second": 11.704,
1265
+ "step": 77672
1266
+ },
1267
+ {
1268
+ "epoch": 19.080234833659492,
1269
+ "grad_norm": 11.346772193908691,
1270
+ "learning_rate": 4.735812133072407e-06,
1271
+ "loss": 0.0136,
1272
+ "step": 78000
1273
+ },
1274
+ {
1275
+ "epoch": 19.202544031311156,
1276
+ "grad_norm": 0.7873362302780151,
1277
+ "learning_rate": 4.637964774951076e-06,
1278
+ "loss": 0.0131,
1279
+ "step": 78500
1280
+ },
1281
+ {
1282
+ "epoch": 19.32485322896282,
1283
+ "grad_norm": 1.0734236240386963,
1284
+ "learning_rate": 4.5401174168297455e-06,
1285
+ "loss": 0.0156,
1286
+ "step": 79000
1287
+ },
1288
+ {
1289
+ "epoch": 19.447162426614483,
1290
+ "grad_norm": 0.031163902953267097,
1291
+ "learning_rate": 4.442270058708415e-06,
1292
+ "loss": 0.0148,
1293
+ "step": 79500
1294
+ },
1295
+ {
1296
+ "epoch": 19.569471624266146,
1297
+ "grad_norm": 3.0132200717926025,
1298
+ "learning_rate": 4.3444227005870845e-06,
1299
+ "loss": 0.0142,
1300
+ "step": 80000
1301
+ },
1302
+ {
1303
+ "epoch": 19.69178082191781,
1304
+ "grad_norm": 5.072915554046631,
1305
+ "learning_rate": 4.246575342465754e-06,
1306
+ "loss": 0.0133,
1307
+ "step": 80500
1308
+ },
1309
+ {
1310
+ "epoch": 19.81409001956947,
1311
+ "grad_norm": 0.12334894388914108,
1312
+ "learning_rate": 4.148727984344423e-06,
1313
+ "loss": 0.0153,
1314
+ "step": 81000
1315
+ },
1316
+ {
1317
+ "epoch": 19.936399217221137,
1318
+ "grad_norm": 3.8586535453796387,
1319
+ "learning_rate": 4.050880626223092e-06,
1320
+ "loss": 0.0138,
1321
+ "step": 81500
1322
+ },
1323
+ {
1324
+ "epoch": 20.0,
1325
+ "eval_accuracy": 0.9280726497196519,
1326
+ "eval_loss": 0.4880678355693817,
1327
+ "eval_runtime": 77.1562,
1328
+ "eval_samples_per_second": 376.781,
1329
+ "eval_steps_per_second": 11.781,
1330
+ "step": 81760
1331
+ },
1332
+ {
1333
+ "epoch": 20.058708414872797,
1334
+ "grad_norm": 5.422935485839844,
1335
+ "learning_rate": 3.953033268101762e-06,
1336
+ "loss": 0.0123,
1337
+ "step": 82000
1338
+ },
1339
+ {
1340
+ "epoch": 20.18101761252446,
1341
+ "grad_norm": 0.02908429130911827,
1342
+ "learning_rate": 3.855185909980431e-06,
1343
+ "loss": 0.0127,
1344
+ "step": 82500
1345
+ },
1346
+ {
1347
+ "epoch": 20.303326810176124,
1348
+ "grad_norm": 0.9683161973953247,
1349
+ "learning_rate": 3.7573385518591e-06,
1350
+ "loss": 0.0106,
1351
+ "step": 83000
1352
+ },
1353
+ {
1354
+ "epoch": 20.425636007827787,
1355
+ "grad_norm": 2.011960029602051,
1356
+ "learning_rate": 3.659491193737769e-06,
1357
+ "loss": 0.0129,
1358
+ "step": 83500
1359
+ },
1360
+ {
1361
+ "epoch": 20.54794520547945,
1362
+ "grad_norm": 0.011290821246802807,
1363
+ "learning_rate": 3.5616438356164386e-06,
1364
+ "loss": 0.0098,
1365
+ "step": 84000
1366
+ },
1367
+ {
1368
+ "epoch": 20.670254403131114,
1369
+ "grad_norm": 3.6643381118774414,
1370
+ "learning_rate": 3.4637964774951077e-06,
1371
+ "loss": 0.0123,
1372
+ "step": 84500
1373
+ },
1374
+ {
1375
+ "epoch": 20.792563600782778,
1376
+ "grad_norm": 0.28946495056152344,
1377
+ "learning_rate": 3.365949119373777e-06,
1378
+ "loss": 0.0119,
1379
+ "step": 85000
1380
+ },
1381
+ {
1382
+ "epoch": 20.91487279843444,
1383
+ "grad_norm": 0.8526332974433899,
1384
+ "learning_rate": 3.2681017612524463e-06,
1385
+ "loss": 0.0127,
1386
+ "step": 85500
1387
+ },
1388
+ {
1389
+ "epoch": 21.0,
1390
+ "eval_accuracy": 0.9281758453441574,
1391
+ "eval_loss": 0.49737828969955444,
1392
+ "eval_runtime": 77.6066,
1393
+ "eval_samples_per_second": 374.594,
1394
+ "eval_steps_per_second": 11.713,
1395
+ "step": 85848
1396
+ },
1397
+ {
1398
+ "epoch": 21.037181996086105,
1399
+ "grad_norm": 0.038626112043857574,
1400
+ "learning_rate": 3.1702544031311154e-06,
1401
+ "loss": 0.0133,
1402
+ "step": 86000
1403
+ },
1404
+ {
1405
+ "epoch": 21.159491193737768,
1406
+ "grad_norm": 1.5674916505813599,
1407
+ "learning_rate": 3.072407045009785e-06,
1408
+ "loss": 0.0079,
1409
+ "step": 86500
1410
+ },
1411
+ {
1412
+ "epoch": 21.28180039138943,
1413
+ "grad_norm": 0.023863431066274643,
1414
+ "learning_rate": 2.974559686888454e-06,
1415
+ "loss": 0.0089,
1416
+ "step": 87000
1417
+ },
1418
+ {
1419
+ "epoch": 21.404109589041095,
1420
+ "grad_norm": 0.027295144274830818,
1421
+ "learning_rate": 2.876712328767123e-06,
1422
+ "loss": 0.0099,
1423
+ "step": 87500
1424
+ },
1425
+ {
1426
+ "epoch": 21.52641878669276,
1427
+ "grad_norm": 1.8621610403060913,
1428
+ "learning_rate": 2.7788649706457927e-06,
1429
+ "loss": 0.01,
1430
+ "step": 88000
1431
+ },
1432
+ {
1433
+ "epoch": 21.648727984344422,
1434
+ "grad_norm": 0.09760987758636475,
1435
+ "learning_rate": 2.681017612524462e-06,
1436
+ "loss": 0.0115,
1437
+ "step": 88500
1438
+ },
1439
+ {
1440
+ "epoch": 21.771037181996086,
1441
+ "grad_norm": 0.0476505346596241,
1442
+ "learning_rate": 2.5831702544031313e-06,
1443
+ "loss": 0.0096,
1444
+ "step": 89000
1445
+ },
1446
+ {
1447
+ "epoch": 21.89334637964775,
1448
+ "grad_norm": 0.08127936720848083,
1449
+ "learning_rate": 2.4853228962818004e-06,
1450
+ "loss": 0.0116,
1451
+ "step": 89500
1452
+ },
1453
+ {
1454
+ "epoch": 22.0,
1455
+ "eval_accuracy": 0.9293797942967218,
1456
+ "eval_loss": 0.4935356378555298,
1457
+ "eval_runtime": 79.4825,
1458
+ "eval_samples_per_second": 365.754,
1459
+ "eval_steps_per_second": 11.436,
1460
+ "step": 89936
1461
+ },
1462
+ {
1463
+ "epoch": 22.015655577299412,
1464
+ "grad_norm": 1.2286592721939087,
1465
+ "learning_rate": 2.3874755381604695e-06,
1466
+ "loss": 0.0086,
1467
+ "step": 90000
1468
+ },
1469
+ {
1470
+ "epoch": 22.137964774951076,
1471
+ "grad_norm": 1.9946578741073608,
1472
+ "learning_rate": 2.289628180039139e-06,
1473
+ "loss": 0.008,
1474
+ "step": 90500
1475
+ },
1476
+ {
1477
+ "epoch": 22.26027397260274,
1478
+ "grad_norm": 0.024408530443906784,
1479
+ "learning_rate": 2.191780821917808e-06,
1480
+ "loss": 0.0115,
1481
+ "step": 91000
1482
+ },
1483
+ {
1484
+ "epoch": 22.382583170254403,
1485
+ "grad_norm": 0.011225187219679356,
1486
+ "learning_rate": 2.0939334637964777e-06,
1487
+ "loss": 0.0085,
1488
+ "step": 91500
1489
+ },
1490
+ {
1491
+ "epoch": 22.504892367906066,
1492
+ "grad_norm": 1.2969368696212769,
1493
+ "learning_rate": 1.996086105675147e-06,
1494
+ "loss": 0.0089,
1495
+ "step": 92000
1496
+ },
1497
+ {
1498
+ "epoch": 22.62720156555773,
1499
+ "grad_norm": 6.824276447296143,
1500
+ "learning_rate": 1.8982387475538161e-06,
1501
+ "loss": 0.0088,
1502
+ "step": 92500
1503
+ },
1504
+ {
1505
+ "epoch": 22.749510763209393,
1506
+ "grad_norm": 0.052455250173807144,
1507
+ "learning_rate": 1.8003913894324854e-06,
1508
+ "loss": 0.0088,
1509
+ "step": 93000
1510
+ },
1511
+ {
1512
+ "epoch": 22.871819960861057,
1513
+ "grad_norm": 0.04021551460027695,
1514
+ "learning_rate": 1.7025440313111545e-06,
1515
+ "loss": 0.009,
1516
+ "step": 93500
1517
+ },
1518
+ {
1519
+ "epoch": 22.99412915851272,
1520
+ "grad_norm": 0.06902284175157547,
1521
+ "learning_rate": 1.6046966731898239e-06,
1522
+ "loss": 0.0106,
1523
+ "step": 94000
1524
+ },
1525
+ {
1526
+ "epoch": 23.0,
1527
+ "eval_accuracy": 0.93065254033229,
1528
+ "eval_loss": 0.4948473870754242,
1529
+ "eval_runtime": 79.6762,
1530
+ "eval_samples_per_second": 364.864,
1531
+ "eval_steps_per_second": 11.409,
1532
+ "step": 94024
1533
+ },
1534
+ {
1535
+ "epoch": 23.116438356164384,
1536
+ "grad_norm": 0.5471883416175842,
1537
+ "learning_rate": 1.5068493150684932e-06,
1538
+ "loss": 0.0068,
1539
+ "step": 94500
1540
+ },
1541
+ {
1542
+ "epoch": 23.238747553816047,
1543
+ "grad_norm": 0.16019482910633087,
1544
+ "learning_rate": 1.4090019569471625e-06,
1545
+ "loss": 0.0089,
1546
+ "step": 95000
1547
+ },
1548
+ {
1549
+ "epoch": 23.36105675146771,
1550
+ "grad_norm": 6.4080963134765625,
1551
+ "learning_rate": 1.3111545988258318e-06,
1552
+ "loss": 0.0077,
1553
+ "step": 95500
1554
+ },
1555
+ {
1556
+ "epoch": 23.483365949119374,
1557
+ "grad_norm": 0.13792355358600616,
1558
+ "learning_rate": 1.213307240704501e-06,
1559
+ "loss": 0.0076,
1560
+ "step": 96000
1561
+ },
1562
+ {
1563
+ "epoch": 23.605675146771038,
1564
+ "grad_norm": 2.563711404800415,
1565
+ "learning_rate": 1.1154598825831702e-06,
1566
+ "loss": 0.0075,
1567
+ "step": 96500
1568
+ },
1569
+ {
1570
+ "epoch": 23.7279843444227,
1571
+ "grad_norm": 0.04809940978884697,
1572
+ "learning_rate": 1.0176125244618395e-06,
1573
+ "loss": 0.0067,
1574
+ "step": 97000
1575
+ },
1576
+ {
1577
+ "epoch": 23.850293542074365,
1578
+ "grad_norm": 0.057318106293678284,
1579
+ "learning_rate": 9.197651663405089e-07,
1580
+ "loss": 0.0062,
1581
+ "step": 97500
1582
+ },
1583
+ {
1584
+ "epoch": 23.972602739726028,
1585
+ "grad_norm": 4.935258865356445,
1586
+ "learning_rate": 8.219178082191781e-07,
1587
+ "loss": 0.0085,
1588
+ "step": 98000
1589
+ },
1590
+ {
1591
+ "epoch": 24.0,
1592
+ "eval_accuracy": 0.9311685184548175,
1593
+ "eval_loss": 0.49324169754981995,
1594
+ "eval_runtime": 79.011,
1595
+ "eval_samples_per_second": 367.936,
1596
+ "eval_steps_per_second": 11.505,
1597
+ "step": 98112
1598
+ },
1599
+ {
1600
+ "epoch": 24.09491193737769,
1601
+ "grad_norm": 6.268829345703125,
1602
+ "learning_rate": 7.240704500978474e-07,
1603
+ "loss": 0.0055,
1604
+ "step": 98500
1605
+ },
1606
+ {
1607
+ "epoch": 24.217221135029355,
1608
+ "grad_norm": 0.00456130551174283,
1609
+ "learning_rate": 6.262230919765167e-07,
1610
+ "loss": 0.0054,
1611
+ "step": 99000
1612
+ },
1613
+ {
1614
+ "epoch": 24.33953033268102,
1615
+ "grad_norm": 0.014587494544684887,
1616
+ "learning_rate": 5.283757338551859e-07,
1617
+ "loss": 0.0067,
1618
+ "step": 99500
1619
+ },
1620
+ {
1621
+ "epoch": 24.461839530332682,
1622
+ "grad_norm": 0.7037560343742371,
1623
+ "learning_rate": 4.305283757338552e-07,
1624
+ "loss": 0.0059,
1625
+ "step": 100000
1626
+ },
1627
+ {
1628
+ "epoch": 24.584148727984346,
1629
+ "grad_norm": 0.006079969462007284,
1630
+ "learning_rate": 3.326810176125245e-07,
1631
+ "loss": 0.0074,
1632
+ "step": 100500
1633
+ },
1634
+ {
1635
+ "epoch": 24.70645792563601,
1636
+ "grad_norm": 1.3056970834732056,
1637
+ "learning_rate": 2.3483365949119375e-07,
1638
+ "loss": 0.0075,
1639
+ "step": 101000
1640
+ },
1641
+ {
1642
+ "epoch": 24.828767123287673,
1643
+ "grad_norm": 8.87458324432373,
1644
+ "learning_rate": 1.36986301369863e-07,
1645
+ "loss": 0.0065,
1646
+ "step": 101500
1647
+ },
1648
+ {
1649
+ "epoch": 24.951076320939336,
1650
+ "grad_norm": 14.917941093444824,
1651
+ "learning_rate": 3.9138943248532294e-08,
1652
+ "loss": 0.0072,
1653
+ "step": 102000
1654
+ },
1655
+ {
1656
+ "epoch": 25.0,
1657
+ "eval_accuracy": 0.9311341199133156,
1658
+ "eval_loss": 0.49332907795906067,
1659
+ "eval_runtime": 79.9831,
1660
+ "eval_samples_per_second": 363.464,
1661
+ "eval_steps_per_second": 11.365,
1662
+ "step": 102200
1663
+ }
1664
+ ],
1665
+ "logging_steps": 500,
1666
+ "max_steps": 102200,
1667
+ "num_input_tokens_seen": 0,
1668
+ "num_train_epochs": 25,
1669
+ "save_steps": 500,
1670
+ "stateful_callbacks": {
1671
+ "TrainerControl": {
1672
+ "args": {
1673
+ "should_epoch_stop": false,
1674
+ "should_evaluate": false,
1675
+ "should_log": false,
1676
+ "should_save": true,
1677
+ "should_training_stop": true
1678
+ },
1679
+ "attributes": {}
1680
+ }
1681
+ },
1682
+ "total_flos": 4.33360446554112e+17,
1683
+ "train_batch_size": 64,
1684
+ "trial_name": null,
1685
+ "trial_params": null
1686
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25409054dd23ca427edffb1377223523e10e1ad97209f9a36a5eb35bb09ef8da
3
+ size 5112
vocab.txt ADDED
The diff for this file is too large to render. See raw diff