snizio committed on
Commit
40240a9
·
verified ·
1 Parent(s): 4b19637

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/mt5-base",
3
+ "architectures": [
4
+ "MT5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 2048,
8
+ "d_kv": 64,
9
+ "d_model": 768,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "mt5",
20
+ "num_decoder_layers": 12,
21
+ "num_heads": 12,
22
+ "num_layers": 12,
23
+ "output_past": true,
24
+ "pad_token_id": 0,
25
+ "relative_attention_max_distance": 128,
26
+ "relative_attention_num_buckets": 32,
27
+ "tie_word_embeddings": false,
28
+ "tokenizer_class": "T5Tokenizer",
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.40.2",
31
+ "use_cache": true,
32
+ "vocab_size": 250112
33
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "decoder_start_token_id": 0,
3
+ "eos_token_id": 1,
4
+ "pad_token_id": 0,
5
+ "transformers_version": "4.40.2"
6
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e36e69864ca2dd7ab198529e09bf6ae29f703b9ac678067a4030e4734dca5d1b
3
+ size 2329638768
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4af40543f3f4737138013cd8dfef60ff96f5f0db4b82571e8ff153b5209c382
3
+ size 4115322
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09dc913f8390f905c4bfee7072c2421292d8f99fba33681bf427cde3f4699045
3
+ size 14512
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fa721b6778b7011e9e69e01915c0edb4d3c2abc253bd9d5a7d2d2c20437ad4e
3
+ size 14512
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff142e42ad4bbcf8518b6a439ef30c14478e4470306df3a76115a5239a55c910
3
+ size 1000
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "<pad>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
3
+ size 4309802
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddb1b00035edffde435e24e69011b7f552691405237455cb7c3a59c890682807
3
+ size 16330540
tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<pad>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "</s>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<unk>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [],
30
+ "clean_up_tokenization_spaces": true,
31
+ "eos_token": "</s>",
32
+ "extra_ids": 0,
33
+ "legacy": true,
34
+ "model_max_length": 1000000000000000019884624838656,
35
+ "pad_token": "<pad>",
36
+ "sp_model_kwargs": {},
37
+ "tokenizer_class": "T5Tokenizer",
38
+ "unk_token": "<unk>"
39
+ }
trainer_state.json ADDED
@@ -0,0 +1,1138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 3.5637874603271484,
3
+ "best_model_checkpoint": "checkpoints/mt5-base/checkpoint-37386",
4
+ "epoch": 13.501625135427952,
5
+ "eval_steps": 2077,
6
+ "global_step": 37386,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.14987360057782592,
13
+ "eval_g2l_cer": 286.0084,
14
+ "eval_g2l_gen_len": 8.1826,
15
+ "eval_g2l_rouge1": 5.5622,
16
+ "eval_g2l_rouge2": 1.1913,
17
+ "eval_g2l_rougeL": 5.4996,
18
+ "eval_g2l_rougeLsum": 5.5114,
19
+ "eval_l2ex_cer": 86.6247,
20
+ "eval_l2ex_gen_len": 7.9383,
21
+ "eval_l2ex_rouge1": 16.8571,
22
+ "eval_l2ex_rouge2": 5.0595,
23
+ "eval_l2ex_rougeL": 15.4605,
24
+ "eval_l2ex_rougeLsum": 15.4778,
25
+ "eval_l2g_cer": 87.9265,
26
+ "eval_l2g_gen_len": 5.5834,
27
+ "eval_l2g_rouge1": 12.3664,
28
+ "eval_l2g_rouge2": 1.6791,
29
+ "eval_l2g_rougeL": 11.57,
30
+ "eval_l2g_rougeLsum": 11.5871,
31
+ "eval_loss": 8.359071731567383,
32
+ "eval_runtime": 145.7254,
33
+ "eval_samples_per_second": 68.08,
34
+ "eval_steps_per_second": 2.134,
35
+ "step": 415
36
+ },
37
+ {
38
+ "epoch": 0.15023474178403756,
39
+ "grad_norm": 1659.9119873046875,
40
+ "learning_rate": 2.0038535645472063e-05,
41
+ "loss": 15.088,
42
+ "step": 416
43
+ },
44
+ {
45
+ "epoch": 0.3004694835680751,
46
+ "grad_norm": 8.369742393493652,
47
+ "learning_rate": 4.0077071290944125e-05,
48
+ "loss": 7.1403,
49
+ "step": 832
50
+ },
51
+ {
52
+ "epoch": 0.4507042253521127,
53
+ "grad_norm": 1.8832136392593384,
54
+ "learning_rate": 6.0115606936416195e-05,
55
+ "loss": 5.3841,
56
+ "step": 1248
57
+ },
58
+ {
59
+ "epoch": 0.6009389671361502,
60
+ "grad_norm": 23.656179428100586,
61
+ "learning_rate": 8.015414258188825e-05,
62
+ "loss": 4.9928,
63
+ "step": 1664
64
+ },
65
+ {
66
+ "epoch": 0.7500902853015529,
67
+ "eval_g2l_cer": 59.4485,
68
+ "eval_g2l_gen_len": 2.9556,
69
+ "eval_g2l_rouge1": 27.1975,
70
+ "eval_g2l_rouge2": 17.2281,
71
+ "eval_g2l_rougeL": 27.1916,
72
+ "eval_g2l_rougeLsum": 27.2117,
73
+ "eval_l2ex_cer": 95.5123,
74
+ "eval_l2ex_gen_len": 23.6742,
75
+ "eval_l2ex_rouge1": 25.7497,
76
+ "eval_l2ex_rouge2": 11.1192,
77
+ "eval_l2ex_rougeL": 22.9461,
78
+ "eval_l2ex_rougeLsum": 22.9441,
79
+ "eval_l2g_cer": 83.2502,
80
+ "eval_l2g_gen_len": 15.9711,
81
+ "eval_l2g_rouge1": 27.2934,
82
+ "eval_l2g_rouge2": 14.8195,
83
+ "eval_l2g_rougeL": 25.9582,
84
+ "eval_l2g_rougeLsum": 25.9617,
85
+ "eval_loss": 4.2178425788879395,
86
+ "eval_runtime": 203.8137,
87
+ "eval_samples_per_second": 48.677,
88
+ "eval_steps_per_second": 1.526,
89
+ "step": 2077
90
+ },
91
+ {
92
+ "epoch": 0.7511737089201878,
93
+ "grad_norm": 1.5714240074157715,
94
+ "learning_rate": 9.999999747638704e-05,
95
+ "loss": 4.7614,
96
+ "step": 2080
97
+ },
98
+ {
99
+ "epoch": 0.9014084507042254,
100
+ "grad_norm": 2.348027467727661,
101
+ "learning_rate": 9.997217976013284e-05,
102
+ "loss": 4.6037,
103
+ "step": 2496
104
+ },
105
+ {
106
+ "epoch": 1.051643192488263,
107
+ "grad_norm": 2.6275577545166016,
108
+ "learning_rate": 9.98898067640237e-05,
109
+ "loss": 4.5136,
110
+ "step": 2912
111
+ },
112
+ {
113
+ "epoch": 1.2018779342723005,
114
+ "grad_norm": 1.1122652292251587,
115
+ "learning_rate": 9.975296886788363e-05,
116
+ "loss": 4.4057,
117
+ "step": 3328
118
+ },
119
+ {
120
+ "epoch": 1.352112676056338,
121
+ "grad_norm": 1.2248876094818115,
122
+ "learning_rate": 9.956181621053908e-05,
123
+ "loss": 4.3513,
124
+ "step": 3744
125
+ },
126
+ {
127
+ "epoch": 1.500180570603106,
128
+ "eval_g2l_cer": 56.9353,
129
+ "eval_g2l_gen_len": 3.093,
130
+ "eval_g2l_rouge1": 32.8366,
131
+ "eval_g2l_rouge2": 24.3793,
132
+ "eval_g2l_rougeL": 32.7964,
133
+ "eval_g2l_rougeLsum": 32.7681,
134
+ "eval_l2ex_cer": 84.2827,
135
+ "eval_l2ex_gen_len": 20.4604,
136
+ "eval_l2ex_rouge1": 28.5353,
137
+ "eval_l2ex_rouge2": 12.5551,
138
+ "eval_l2ex_rougeL": 25.5058,
139
+ "eval_l2ex_rougeLsum": 25.5427,
140
+ "eval_l2g_cer": 81.1104,
141
+ "eval_l2g_gen_len": 18.0072,
142
+ "eval_l2g_rouge1": 32.688,
143
+ "eval_l2g_rouge2": 18.9467,
144
+ "eval_l2g_rougeL": 30.7295,
145
+ "eval_l2g_rougeLsum": 30.7676,
146
+ "eval_loss": 3.949233293533325,
147
+ "eval_runtime": 197.691,
148
+ "eval_samples_per_second": 50.184,
149
+ "eval_steps_per_second": 1.573,
150
+ "step": 4154
151
+ },
152
+ {
153
+ "epoch": 1.5023474178403755,
154
+ "grad_norm": 1.4981008768081665,
155
+ "learning_rate": 9.931655852508637e-05,
156
+ "loss": 4.3061,
157
+ "step": 4160
158
+ },
159
+ {
160
+ "epoch": 1.652582159624413,
161
+ "grad_norm": 1.050997018814087,
162
+ "learning_rate": 9.901746490877203e-05,
163
+ "loss": 4.2525,
164
+ "step": 4576
165
+ },
166
+ {
167
+ "epoch": 1.8028169014084507,
168
+ "grad_norm": 0.8377422094345093,
169
+ "learning_rate": 9.866486352773886e-05,
170
+ "loss": 4.2289,
171
+ "step": 4992
172
+ },
173
+ {
174
+ "epoch": 1.9530516431924883,
175
+ "grad_norm": 0.968815267086029,
176
+ "learning_rate": 9.82591412569612e-05,
177
+ "loss": 4.1958,
178
+ "step": 5408
179
+ },
180
+ {
181
+ "epoch": 2.103286384976526,
182
+ "grad_norm": 0.9952152967453003,
183
+ "learning_rate": 9.780074325576496e-05,
184
+ "loss": 4.1187,
185
+ "step": 5824
186
+ },
187
+ {
188
+ "epoch": 2.2502708559046587,
189
+ "eval_g2l_cer": 53.8806,
190
+ "eval_g2l_gen_len": 3.0548,
191
+ "eval_g2l_rouge1": 35.439,
192
+ "eval_g2l_rouge2": 27.2433,
193
+ "eval_g2l_rougeL": 35.4384,
194
+ "eval_g2l_rougeLsum": 35.3985,
195
+ "eval_l2ex_cer": 89.3431,
196
+ "eval_l2ex_gen_len": 23.4573,
197
+ "eval_l2ex_rouge1": 27.8815,
198
+ "eval_l2ex_rouge2": 12.1568,
199
+ "eval_l2ex_rougeL": 24.5796,
200
+ "eval_l2ex_rougeLsum": 24.6286,
201
+ "eval_l2g_cer": 78.589,
202
+ "eval_l2g_gen_len": 17.4946,
203
+ "eval_l2g_rouge1": 35.6236,
204
+ "eval_l2g_rouge2": 22.8027,
205
+ "eval_l2g_rougeL": 33.8966,
206
+ "eval_l2g_rougeLsum": 33.9001,
207
+ "eval_loss": 3.8344309329986572,
208
+ "eval_runtime": 202.9852,
209
+ "eval_samples_per_second": 48.875,
210
+ "eval_steps_per_second": 1.532,
211
+ "step": 6231
212
+ },
213
+ {
214
+ "epoch": 2.2535211267605635,
215
+ "grad_norm": 24.567110061645508,
216
+ "learning_rate": 9.72901724793979e-05,
217
+ "loss": 4.0993,
218
+ "step": 6240
219
+ },
220
+ {
221
+ "epoch": 2.403755868544601,
222
+ "grad_norm": 0.9726364612579346,
223
+ "learning_rate": 9.672798912718604e-05,
224
+ "loss": 4.0734,
225
+ "step": 6656
226
+ },
227
+ {
228
+ "epoch": 2.5539906103286385,
229
+ "grad_norm": 0.9216151833534241,
230
+ "learning_rate": 9.611481002788184e-05,
231
+ "loss": 4.0584,
232
+ "step": 7072
233
+ },
234
+ {
235
+ "epoch": 2.704225352112676,
236
+ "grad_norm": 0.7880883812904358,
237
+ "learning_rate": 9.545130796287832e-05,
238
+ "loss": 4.0312,
239
+ "step": 7488
240
+ },
241
+ {
242
+ "epoch": 2.8544600938967135,
243
+ "grad_norm": 0.9635422229766846,
244
+ "learning_rate": 9.473821092803199e-05,
245
+ "loss": 4.0046,
246
+ "step": 7904
247
+ },
248
+ {
249
+ "epoch": 3.0003611412062114,
250
+ "eval_g2l_cer": 52.062,
251
+ "eval_g2l_gen_len": 3.0702,
252
+ "eval_g2l_rouge1": 36.8811,
253
+ "eval_g2l_rouge2": 28.8156,
254
+ "eval_g2l_rougeL": 36.8925,
255
+ "eval_g2l_rougeLsum": 36.8317,
256
+ "eval_l2ex_cer": 90.1083,
257
+ "eval_l2ex_gen_len": 22.4645,
258
+ "eval_l2ex_rouge1": 27.5056,
259
+ "eval_l2ex_rouge2": 12.5248,
260
+ "eval_l2ex_rougeL": 24.4085,
261
+ "eval_l2ex_rougeLsum": 24.4463,
262
+ "eval_l2g_cer": 78.1779,
263
+ "eval_l2g_gen_len": 17.8095,
264
+ "eval_l2g_rouge1": 36.8332,
265
+ "eval_l2g_rouge2": 23.9422,
266
+ "eval_l2g_rougeL": 34.9672,
267
+ "eval_l2g_rougeLsum": 34.995,
268
+ "eval_loss": 3.7595808506011963,
269
+ "eval_runtime": 202.3409,
270
+ "eval_samples_per_second": 49.031,
271
+ "eval_steps_per_second": 1.537,
272
+ "step": 8308
273
+ },
274
+ {
275
+ "epoch": 3.004694835680751,
276
+ "grad_norm": 0.900855541229248,
277
+ "learning_rate": 9.397630133490413e-05,
278
+ "loss": 3.992,
279
+ "step": 8320
280
+ },
281
+ {
282
+ "epoch": 3.1549295774647885,
283
+ "grad_norm": 0.8881470561027527,
284
+ "learning_rate": 9.316641515229741e-05,
285
+ "loss": 3.9362,
286
+ "step": 8736
287
+ },
288
+ {
289
+ "epoch": 3.3051643192488265,
290
+ "grad_norm": 0.7969784140586853,
291
+ "learning_rate": 9.230944098902894e-05,
292
+ "loss": 3.9143,
293
+ "step": 9152
294
+ },
295
+ {
296
+ "epoch": 3.455399061032864,
297
+ "grad_norm": 0.8603357672691345,
298
+ "learning_rate": 9.1406319118947e-05,
299
+ "loss": 3.9162,
300
+ "step": 9568
301
+ },
302
+ {
303
+ "epoch": 3.6056338028169015,
304
+ "grad_norm": 0.9974511861801147,
305
+ "learning_rate": 9.045804044926044e-05,
306
+ "loss": 3.8987,
307
+ "step": 9984
308
+ },
309
+ {
310
+ "epoch": 3.7504514265077646,
311
+ "eval_g2l_cer": 50.7917,
312
+ "eval_g2l_gen_len": 3.0031,
313
+ "eval_g2l_rouge1": 37.7135,
314
+ "eval_g2l_rouge2": 29.9526,
315
+ "eval_g2l_rougeL": 37.7649,
316
+ "eval_g2l_rougeLsum": 37.7041,
317
+ "eval_l2ex_cer": 86.8671,
318
+ "eval_l2ex_gen_len": 22.2271,
319
+ "eval_l2ex_rouge1": 28.7692,
320
+ "eval_l2ex_rouge2": 12.8536,
321
+ "eval_l2ex_rougeL": 25.3768,
322
+ "eval_l2ex_rougeLsum": 25.4158,
323
+ "eval_l2g_cer": 73.3411,
324
+ "eval_l2g_gen_len": 15.6692,
325
+ "eval_l2g_rouge1": 37.5152,
326
+ "eval_l2g_rouge2": 24.5536,
327
+ "eval_l2g_rougeL": 35.5225,
328
+ "eval_l2g_rougeLsum": 35.5437,
329
+ "eval_loss": 3.7121169567108154,
330
+ "eval_runtime": 192.3527,
331
+ "eval_samples_per_second": 51.577,
332
+ "eval_steps_per_second": 1.617,
333
+ "step": 10385
334
+ },
335
+ {
336
+ "epoch": 3.755868544600939,
337
+ "grad_norm": 0.8458616733551025,
338
+ "learning_rate": 8.94656454333133e-05,
339
+ "loss": 3.8883,
340
+ "step": 10400
341
+ },
342
+ {
343
+ "epoch": 3.9061032863849765,
344
+ "grad_norm": 3.1263327598571777,
345
+ "learning_rate": 8.843022292899726e-05,
346
+ "loss": 3.8775,
347
+ "step": 10816
348
+ },
349
+ {
350
+ "epoch": 4.056338028169014,
351
+ "grad_norm": 1.013489842414856,
352
+ "learning_rate": 8.735290900405437e-05,
353
+ "loss": 3.8514,
354
+ "step": 11232
355
+ },
356
+ {
357
+ "epoch": 4.206572769953052,
358
+ "grad_norm": 0.9674685001373291,
359
+ "learning_rate": 8.623488568958123e-05,
360
+ "loss": 3.7962,
361
+ "step": 11648
362
+ },
363
+ {
364
+ "epoch": 4.356807511737089,
365
+ "grad_norm": 1.0607421398162842,
366
+ "learning_rate": 8.507737968310197e-05,
367
+ "loss": 3.8043,
368
+ "step": 12064
369
+ },
370
+ {
371
+ "epoch": 4.500541711809317,
372
+ "eval_g2l_cer": 50.088,
373
+ "eval_g2l_gen_len": 3.0488,
374
+ "eval_g2l_rouge1": 38.7702,
375
+ "eval_g2l_rouge2": 30.6004,
376
+ "eval_g2l_rougeL": 38.7959,
377
+ "eval_g2l_rougeLsum": 38.7454,
378
+ "eval_l2ex_cer": 84.5143,
379
+ "eval_l2ex_gen_len": 20.52,
380
+ "eval_l2ex_rouge1": 28.9181,
381
+ "eval_l2ex_rouge2": 13.2853,
382
+ "eval_l2ex_rougeL": 25.6409,
383
+ "eval_l2ex_rougeLsum": 25.6588,
384
+ "eval_l2g_cer": 72.4949,
385
+ "eval_l2g_gen_len": 15.2432,
386
+ "eval_l2g_rouge1": 37.6479,
387
+ "eval_l2g_rouge2": 24.833,
388
+ "eval_l2g_rougeL": 35.7678,
389
+ "eval_l2g_rougeLsum": 35.776,
390
+ "eval_loss": 3.674677848815918,
391
+ "eval_runtime": 190.2532,
392
+ "eval_samples_per_second": 52.146,
393
+ "eval_steps_per_second": 1.635,
394
+ "step": 12462
395
+ },
396
+ {
397
+ "epoch": 4.507042253521127,
398
+ "grad_norm": 0.9242987632751465,
399
+ "learning_rate": 8.388166100263313e-05,
400
+ "loss": 3.804,
401
+ "step": 12480
402
+ },
403
+ {
404
+ "epoch": 4.657276995305164,
405
+ "grad_norm": 0.8233311772346497,
406
+ "learning_rate": 8.264904159321721e-05,
407
+ "loss": 3.7844,
408
+ "step": 12896
409
+ },
410
+ {
411
+ "epoch": 4.807511737089202,
412
+ "grad_norm": 1.918661117553711,
413
+ "learning_rate": 8.138087388745395e-05,
414
+ "loss": 3.7948,
415
+ "step": 13312
416
+ },
417
+ {
418
+ "epoch": 4.957746478873239,
419
+ "grad_norm": 0.8277648091316223,
420
+ "learning_rate": 8.00785493216083e-05,
421
+ "loss": 3.7951,
422
+ "step": 13728
423
+ },
424
+ {
425
+ "epoch": 5.107981220657277,
426
+ "grad_norm": 1.0518523454666138,
427
+ "learning_rate": 7.874349680892367e-05,
428
+ "loss": 3.7423,
429
+ "step": 14144
430
+ },
431
+ {
432
+ "epoch": 5.250631997110871,
433
+ "eval_g2l_cer": 49.743,
434
+ "eval_g2l_gen_len": 3.0201,
435
+ "eval_g2l_rouge1": 38.8263,
436
+ "eval_g2l_rouge2": 31.1673,
437
+ "eval_g2l_rougeL": 38.8286,
438
+ "eval_g2l_rougeLsum": 38.7898,
439
+ "eval_l2ex_cer": 86.565,
440
+ "eval_l2ex_gen_len": 21.7523,
441
+ "eval_l2ex_rouge1": 28.4984,
442
+ "eval_l2ex_rouge2": 13.072,
443
+ "eval_l2ex_rougeL": 25.2667,
444
+ "eval_l2ex_rougeLsum": 25.2757,
445
+ "eval_l2g_cer": 73.2917,
446
+ "eval_l2g_gen_len": 16.0011,
447
+ "eval_l2g_rouge1": 38.0438,
448
+ "eval_l2g_rouge2": 25.3209,
449
+ "eval_l2g_rougeL": 36.1091,
450
+ "eval_l2g_rougeLsum": 36.1243,
451
+ "eval_loss": 3.649608850479126,
452
+ "eval_runtime": 197.4229,
453
+ "eval_samples_per_second": 50.253,
454
+ "eval_steps_per_second": 1.575,
455
+ "step": 14539
456
+ },
457
+ {
458
+ "epoch": 5.258215962441315,
459
+ "grad_norm": 0.8540360331535339,
460
+ "learning_rate": 7.737718117181538e-05,
461
+ "loss": 3.7126,
462
+ "step": 14560
463
+ },
464
+ {
465
+ "epoch": 5.408450704225352,
466
+ "grad_norm": 0.9189392328262329,
467
+ "learning_rate": 7.598110153466441e-05,
468
+ "loss": 3.7223,
469
+ "step": 14976
470
+ },
471
+ {
472
+ "epoch": 5.55868544600939,
473
+ "grad_norm": 0.92618727684021,
474
+ "learning_rate": 7.45567896789749e-05,
475
+ "loss": 3.7139,
476
+ "step": 15392
477
+ },
478
+ {
479
+ "epoch": 5.708920187793427,
480
+ "grad_norm": 0.7882264852523804,
481
+ "learning_rate": 7.310580836270044e-05,
482
+ "loss": 3.7179,
483
+ "step": 15808
484
+ },
485
+ {
486
+ "epoch": 5.859154929577465,
487
+ "grad_norm": 0.8529959321022034,
488
+ "learning_rate": 7.162974960558259e-05,
489
+ "loss": 3.7121,
490
+ "step": 16224
491
+ },
492
+ {
493
+ "epoch": 6.000722282412423,
494
+ "eval_g2l_cer": 49.3934,
495
+ "eval_g2l_gen_len": 3.0096,
496
+ "eval_g2l_rouge1": 39.4408,
497
+ "eval_g2l_rouge2": 31.7057,
498
+ "eval_g2l_rougeL": 39.4639,
499
+ "eval_g2l_rougeLsum": 39.4161,
500
+ "eval_l2ex_cer": 86.119,
501
+ "eval_l2ex_gen_len": 20.7112,
502
+ "eval_l2ex_rouge1": 28.8739,
503
+ "eval_l2ex_rouge2": 13.2661,
504
+ "eval_l2ex_rougeL": 25.7042,
505
+ "eval_l2ex_rougeLsum": 25.7118,
506
+ "eval_l2g_cer": 73.625,
507
+ "eval_l2g_gen_len": 15.9897,
508
+ "eval_l2g_rouge1": 38.1171,
509
+ "eval_l2g_rouge2": 25.6405,
510
+ "eval_l2g_rougeL": 36.2592,
511
+ "eval_l2g_rougeLsum": 36.2666,
512
+ "eval_loss": 3.6273715496063232,
513
+ "eval_runtime": 193.9276,
514
+ "eval_samples_per_second": 51.158,
515
+ "eval_steps_per_second": 1.604,
516
+ "step": 16616
517
+ },
518
+ {
519
+ "epoch": 6.009389671361502,
520
+ "grad_norm": 0.7976297736167908,
521
+ "learning_rate": 7.013023294238368e-05,
522
+ "loss": 3.7191,
523
+ "step": 16640
524
+ },
525
+ {
526
+ "epoch": 6.15962441314554,
527
+ "grad_norm": 0.8516309261322021,
528
+ "learning_rate": 6.860890364592963e-05,
529
+ "loss": 3.6428,
530
+ "step": 17056
531
+ },
532
+ {
533
+ "epoch": 6.309859154929577,
534
+ "grad_norm": 0.9273515343666077,
535
+ "learning_rate": 6.706743092191335e-05,
536
+ "loss": 3.6566,
537
+ "step": 17472
538
+ },
539
+ {
540
+ "epoch": 6.460093896713615,
541
+ "grad_norm": 0.932829737663269,
542
+ "learning_rate": 6.550750607743873e-05,
543
+ "loss": 3.6627,
544
+ "step": 17888
545
+ },
546
+ {
547
+ "epoch": 6.610328638497653,
548
+ "grad_norm": 0.9968202114105225,
549
+ "learning_rate": 6.393084066531485e-05,
550
+ "loss": 3.6652,
551
+ "step": 18304
552
+ },
553
+ {
554
+ "epoch": 6.750812567713976,
555
+ "eval_g2l_cer": 49.5579,
556
+ "eval_g2l_gen_len": 2.9938,
557
+ "eval_g2l_rouge1": 39.6581,
558
+ "eval_g2l_rouge2": 32.026,
559
+ "eval_g2l_rougeL": 39.6932,
560
+ "eval_g2l_rougeLsum": 39.6518,
561
+ "eval_l2ex_cer": 88.4427,
562
+ "eval_l2ex_gen_len": 23.11,
563
+ "eval_l2ex_rouge1": 28.1485,
564
+ "eval_l2ex_rouge2": 12.4558,
565
+ "eval_l2ex_rougeL": 24.9414,
566
+ "eval_l2ex_rougeLsum": 24.9605,
567
+ "eval_l2g_cer": 73.3296,
568
+ "eval_l2g_gen_len": 16.3263,
569
+ "eval_l2g_rouge1": 38.4506,
570
+ "eval_l2g_rouge2": 25.7696,
571
+ "eval_l2g_rougeL": 36.5748,
572
+ "eval_l2g_rougeLsum": 36.6091,
573
+ "eval_loss": 3.6120047569274902,
574
+ "eval_runtime": 197.9501,
575
+ "eval_samples_per_second": 50.119,
576
+ "eval_steps_per_second": 1.571,
577
+ "step": 18693
578
+ },
579
+ {
580
+ "epoch": 6.76056338028169,
581
+ "grad_norm": 0.7791869640350342,
582
+ "learning_rate": 6.233916460613673e-05,
583
+ "loss": 3.6614,
584
+ "step": 18720
585
+ },
586
+ {
587
+ "epoch": 6.910798122065728,
588
+ "grad_norm": 0.9385781288146973,
589
+ "learning_rate": 6.0734224290212784e-05,
590
+ "loss": 3.6471,
591
+ "step": 19136
592
+ },
593
+ {
594
+ "epoch": 7.061032863849765,
595
+ "grad_norm": 0.8267916440963745,
596
+ "learning_rate": 5.9117780661421754e-05,
597
+ "loss": 3.6264,
598
+ "step": 19552
599
+ },
600
+ {
601
+ "epoch": 7.211267605633803,
602
+ "grad_norm": 0.794131875038147,
603
+ "learning_rate": 5.7491607285101345e-05,
604
+ "loss": 3.6015,
605
+ "step": 19968
606
+ },
607
+ {
608
+ "epoch": 7.36150234741784,
609
+ "grad_norm": 0.8748852610588074,
610
+ "learning_rate": 5.585748840208869e-05,
611
+ "loss": 3.5993,
612
+ "step": 20384
613
+ },
614
+ {
615
+ "epoch": 7.500902853015529,
616
+ "eval_g2l_cer": 50.088,
617
+ "eval_g2l_gen_len": 3.0582,
618
+ "eval_g2l_rouge1": 39.9874,
619
+ "eval_g2l_rouge2": 32.4432,
620
+ "eval_g2l_rougeL": 40.0195,
621
+ "eval_g2l_rougeLsum": 39.9365,
622
+ "eval_l2ex_cer": 87.6165,
623
+ "eval_l2ex_gen_len": 22.7133,
624
+ "eval_l2ex_rouge1": 28.1937,
625
+ "eval_l2ex_rouge2": 12.5673,
626
+ "eval_l2ex_rougeL": 24.9397,
627
+ "eval_l2ex_rougeLsum": 24.921,
628
+ "eval_l2g_cer": 72.7284,
629
+ "eval_l2g_gen_len": 15.6759,
630
+ "eval_l2g_rouge1": 38.4813,
631
+ "eval_l2g_rouge2": 25.936,
632
+ "eval_l2g_rougeL": 36.5693,
633
+ "eval_l2g_rougeLsum": 36.5729,
634
+ "eval_loss": 3.6013987064361572,
635
+ "eval_runtime": 195.438,
636
+ "eval_samples_per_second": 50.763,
637
+ "eval_steps_per_second": 1.591,
638
+ "step": 20770
639
+ },
640
+ {
641
+ "epoch": 7.511737089201878,
642
+ "grad_norm": 0.9019631743431091,
643
+ "learning_rate": 5.4217216971047445e-05,
644
+ "loss": 3.5978,
645
+ "step": 20800
646
+ },
647
+ {
648
+ "epoch": 7.661971830985916,
649
+ "grad_norm": 0.8872570395469666,
650
+ "learning_rate": 5.257259270122993e-05,
651
+ "loss": 3.6113,
652
+ "step": 21216
653
+ },
654
+ {
655
+ "epoch": 7.812206572769953,
656
+ "grad_norm": 0.7394893169403076,
657
+ "learning_rate": 5.0925420077832285e-05,
658
+ "loss": 3.593,
659
+ "step": 21632
660
+ },
661
+ {
662
+ "epoch": 7.962441314553991,
663
+ "grad_norm": 0.8534842133522034,
664
+ "learning_rate": 4.927750638210947e-05,
665
+ "loss": 3.5963,
666
+ "step": 22048
667
+ },
668
+ {
669
+ "epoch": 8.112676056338028,
670
+ "grad_norm": 0.9047814607620239,
671
+ "learning_rate": 4.7630659708422666e-05,
672
+ "loss": 3.5722,
673
+ "step": 22464
674
+ },
675
+ {
676
+ "epoch": 8.250993138317082,
677
+ "eval_g2l_cer": 49.5716,
678
+ "eval_g2l_gen_len": 3.0388,
679
+ "eval_g2l_rouge1": 40.4088,
680
+ "eval_g2l_rouge2": 32.7272,
681
+ "eval_g2l_rougeL": 40.4374,
682
+ "eval_g2l_rougeLsum": 40.3677,
683
+ "eval_l2ex_cer": 83.5858,
684
+ "eval_l2ex_gen_len": 20.4851,
685
+ "eval_l2ex_rouge1": 29.084,
686
+ "eval_l2ex_rouge2": 12.9208,
687
+ "eval_l2ex_rougeL": 25.6832,
688
+ "eval_l2ex_rougeLsum": 25.7033,
689
+ "eval_l2g_cer": 72.1741,
690
+ "eval_l2g_gen_len": 15.6461,
691
+ "eval_l2g_rouge1": 38.8628,
692
+ "eval_l2g_rouge2": 26.1912,
693
+ "eval_l2g_rougeL": 36.9072,
694
+ "eval_l2g_rougeLsum": 36.9086,
695
+ "eval_loss": 3.5901942253112793,
696
+ "eval_runtime": 190.412,
697
+ "eval_samples_per_second": 52.103,
698
+ "eval_steps_per_second": 1.633,
699
+ "step": 22847
700
+ },
701
+ {
702
+ "epoch": 8.262910798122066,
703
+ "grad_norm": 0.8366677761077881,
704
+ "learning_rate": 4.598668698039414e-05,
705
+ "loss": 3.5641,
706
+ "step": 22880
707
+ },
708
+ {
709
+ "epoch": 8.413145539906104,
710
+ "grad_norm": 0.8628195524215698,
711
+ "learning_rate": 4.4347391968347015e-05,
712
+ "loss": 3.5702,
713
+ "step": 23296
714
+ },
715
+ {
716
+ "epoch": 8.56338028169014,
717
+ "grad_norm": 0.9060849547386169,
718
+ "learning_rate": 4.27145733102046e-05,
719
+ "loss": 3.5508,
720
+ "step": 23712
721
+ },
722
+ {
723
+ "epoch": 8.713615023474178,
724
+ "grad_norm": 0.8726539015769958,
725
+ "learning_rate": 4.109002253802116e-05,
726
+ "loss": 3.5637,
727
+ "step": 24128
728
+ },
729
+ {
730
+ "epoch": 8.863849765258216,
731
+ "grad_norm": 0.9154978394508362,
732
+ "learning_rate": 3.947552211230913e-05,
733
+ "loss": 3.5435,
734
+ "step": 24544
735
+ },
736
+ {
737
+ "epoch": 9.001083423618635,
738
+ "eval_g2l_cer": 48.6326,
739
+ "eval_g2l_gen_len": 3.008,
740
+ "eval_g2l_rouge1": 40.6427,
741
+ "eval_g2l_rouge2": 33.0447,
742
+ "eval_g2l_rougeL": 40.6651,
743
+ "eval_g2l_rougeLsum": 40.6197,
744
+ "eval_l2ex_cer": 85.6816,
745
+ "eval_l2ex_gen_len": 20.9753,
746
+ "eval_l2ex_rouge1": 28.5827,
747
+ "eval_l2ex_rouge2": 12.8213,
748
+ "eval_l2ex_rougeL": 25.352,
749
+ "eval_l2ex_rougeLsum": 25.3642,
750
+ "eval_l2g_cer": 72.7802,
751
+ "eval_l2g_gen_len": 15.8102,
752
+ "eval_l2g_rouge1": 38.814,
753
+ "eval_l2g_rouge2": 26.1373,
754
+ "eval_l2g_rougeL": 36.8943,
755
+ "eval_l2g_rougeLsum": 36.9272,
756
+ "eval_loss": 3.5814104080200195,
757
+ "eval_runtime": 193.5202,
758
+ "eval_samples_per_second": 51.266,
759
+ "eval_steps_per_second": 1.607,
760
+ "step": 24924
761
+ },
762
+ {
763
+ "epoch": 9.014084507042254,
764
+ "grad_norm": 0.9910312294960022,
765
+ "learning_rate": 3.7872843466319744e-05,
766
+ "loss": 3.5601,
767
+ "step": 24960
768
+ },
769
+ {
770
+ "epoch": 9.164319248826292,
771
+ "grad_norm": 0.913223922252655,
772
+ "learning_rate": 3.6283745062422726e-05,
773
+ "loss": 3.5156,
774
+ "step": 25376
775
+ },
776
+ {
777
+ "epoch": 9.314553990610328,
778
+ "grad_norm": 0.9026065468788147,
779
+ "learning_rate": 3.470997046271774e-05,
780
+ "loss": 3.5337,
781
+ "step": 25792
782
+ },
783
+ {
784
+ "epoch": 9.464788732394366,
785
+ "grad_norm": 0.9726517796516418,
786
+ "learning_rate": 3.315324641599434e-05,
787
+ "loss": 3.5294,
788
+ "step": 26208
789
+ },
790
+ {
791
+ "epoch": 9.615023474178404,
792
+ "grad_norm": 0.954593300819397,
793
+ "learning_rate": 3.161528096313964e-05,
794
+ "loss": 3.5242,
795
+ "step": 26624
796
+ },
797
+ {
798
+ "epoch": 9.751173708920188,
799
+ "eval_g2l_cer": 48.3196,
800
+ "eval_g2l_gen_len": 3.0196,
801
+ "eval_g2l_rouge1": 41.1733,
802
+ "eval_g2l_rouge2": 33.4761,
803
+ "eval_g2l_rougeL": 41.172,
804
+ "eval_g2l_rougeLsum": 41.1111,
805
+ "eval_l2ex_cer": 86.3469,
806
+ "eval_l2ex_gen_len": 21.333,
807
+ "eval_l2ex_rouge1": 28.6196,
808
+ "eval_l2ex_rouge2": 12.797,
809
+ "eval_l2ex_rougeL": 25.331,
810
+ "eval_l2ex_rougeLsum": 25.3251,
811
+ "eval_l2g_cer": 71.8519,
812
+ "eval_l2g_gen_len": 15.5771,
813
+ "eval_l2g_rouge1": 38.9877,
814
+ "eval_l2g_rouge2": 26.3016,
815
+ "eval_l2g_rougeL": 36.97,
816
+ "eval_l2g_rougeLsum": 37.0109,
817
+ "eval_loss": 3.5751187801361084,
818
+ "eval_runtime": 190.5769,
819
+ "eval_samples_per_second": 52.058,
820
+ "eval_steps_per_second": 1.632,
821
+ "step": 27001
822
+ },
823
+ {
824
+ "epoch": 9.765258215962442,
825
+ "grad_norm": 0.7817335724830627,
826
+ "learning_rate": 3.00977615630722e-05,
827
+ "loss": 3.5332,
828
+ "step": 27040
829
+ },
830
+ {
831
+ "epoch": 9.915492957746478,
832
+ "grad_norm": 0.8576836585998535,
833
+ "learning_rate": 2.8602353241258667e-05,
834
+ "loss": 3.5247,
835
+ "step": 27456
836
+ },
837
+ {
838
+ "epoch": 10.065727699530516,
839
+ "grad_norm": 0.924045741558075,
840
+ "learning_rate": 2.7130696762844198e-05,
841
+ "loss": 3.5171,
842
+ "step": 27872
843
+ },
844
+ {
845
+ "epoch": 10.215962441314554,
846
+ "grad_norm": 0.9701129198074341,
847
+ "learning_rate": 2.568440683240166e-05,
848
+ "loss": 3.4886,
849
+ "step": 28288
850
+ },
851
+ {
852
+ "epoch": 10.366197183098592,
853
+ "grad_norm": 0.8473976850509644,
854
+ "learning_rate": 2.426507032227427e-05,
855
+ "loss": 3.5134,
856
+ "step": 28704
857
+ },
858
+ {
859
+ "epoch": 10.501263994221741,
860
+ "eval_g2l_cer": 48.8336,
861
+ "eval_g2l_gen_len": 3.0502,
862
+ "eval_g2l_rouge1": 41.0241,
863
+ "eval_g2l_rouge2": 33.2994,
864
+ "eval_g2l_rougeL": 41.0374,
865
+ "eval_g2l_rougeLsum": 40.9554,
866
+ "eval_l2ex_cer": 85.2795,
867
+ "eval_l2ex_gen_len": 21.6999,
868
+ "eval_l2ex_rouge1": 28.6576,
869
+ "eval_l2ex_rouge2": 12.5848,
870
+ "eval_l2ex_rougeL": 25.1057,
871
+ "eval_l2ex_rougeLsum": 25.1478,
872
+ "eval_l2g_cer": 71.5555,
873
+ "eval_l2g_gen_len": 15.5923,
874
+ "eval_l2g_rouge1": 39.111,
875
+ "eval_l2g_rouge2": 26.3632,
876
+ "eval_l2g_rougeL": 37.134,
877
+ "eval_l2g_rougeLsum": 37.1562,
878
+ "eval_loss": 3.5716097354888916,
879
+ "eval_runtime": 190.1354,
880
+ "eval_samples_per_second": 52.179,
881
+ "eval_steps_per_second": 1.636,
882
+ "step": 29078
883
+ },
884
+ {
885
+ "epoch": 10.51643192488263,
886
+ "grad_norm": 0.9222161769866943,
887
+ "learning_rate": 2.2874244531456016e-05,
888
+ "loss": 3.4995,
889
+ "step": 29120
890
+ },
891
+ {
892
+ "epoch": 10.666666666666666,
893
+ "grad_norm": 0.8834406137466431,
894
+ "learning_rate": 2.1513455476919875e-05,
895
+ "loss": 3.5005,
896
+ "step": 29536
897
+ },
898
+ {
899
+ "epoch": 10.816901408450704,
900
+ "grad_norm": 1.2534151077270508,
901
+ "learning_rate": 2.0184196219268805e-05,
902
+ "loss": 3.4956,
903
+ "step": 29952
904
+ },
905
+ {
906
+ "epoch": 10.967136150234742,
907
+ "grad_norm": 1.0579476356506348,
908
+ "learning_rate": 1.8887925224546575e-05,
909
+ "loss": 3.4984,
910
+ "step": 30368
911
+ },
912
+ {
913
+ "epoch": 11.11737089201878,
914
+ "grad_norm": 0.9352797269821167,
915
+ "learning_rate": 1.7626064764005655e-05,
916
+ "loss": 3.4891,
917
+ "step": 30784
918
+ },
919
+ {
920
+ "epoch": 11.251354279523294,
921
+ "eval_g2l_cer": 48.1779,
922
+ "eval_g2l_gen_len": 3.0241,
923
+ "eval_g2l_rouge1": 41.3076,
924
+ "eval_g2l_rouge2": 33.5874,
925
+ "eval_g2l_rougeL": 41.3381,
926
+ "eval_g2l_rougeLsum": 41.2834,
927
+ "eval_l2ex_cer": 86.303,
928
+ "eval_l2ex_gen_len": 21.6927,
929
+ "eval_l2ex_rouge1": 28.5306,
930
+ "eval_l2ex_rouge2": 12.66,
931
+ "eval_l2ex_rougeL": 25.107,
932
+ "eval_l2ex_rougeLsum": 25.1229,
933
+ "eval_l2g_cer": 71.7607,
934
+ "eval_l2g_gen_len": 15.6002,
935
+ "eval_l2g_rouge1": 39.1998,
936
+ "eval_l2g_rouge2": 26.5146,
937
+ "eval_l2g_rougeL": 37.2299,
938
+ "eval_l2g_rougeLsum": 37.2583,
939
+ "eval_loss": 3.5692920684814453,
940
+ "eval_runtime": 191.2935,
941
+ "eval_samples_per_second": 51.863,
942
+ "eval_steps_per_second": 1.626,
943
+ "step": 31155
944
+ },
945
+ {
946
+ "epoch": 11.267605633802816,
947
+ "grad_norm": 0.8403520584106445,
948
+ "learning_rate": 1.6399999353588347e-05,
949
+ "loss": 3.4762,
950
+ "step": 31200
951
+ },
952
+ {
953
+ "epoch": 11.417840375586854,
954
+ "grad_norm": 0.8685266375541687,
955
+ "learning_rate": 1.5211074234832911e-05,
956
+ "loss": 3.491,
957
+ "step": 31616
958
+ },
959
+ {
960
+ "epoch": 11.568075117370892,
961
+ "grad_norm": 0.8662200570106506,
962
+ "learning_rate": 1.4060593898871712e-05,
963
+ "loss": 3.4818,
964
+ "step": 32032
965
+ },
966
+ {
967
+ "epoch": 11.71830985915493,
968
+ "grad_norm": 0.915972888469696,
969
+ "learning_rate": 1.2949820655140888e-05,
970
+ "loss": 3.4729,
971
+ "step": 32448
972
+ },
973
+ {
974
+ "epoch": 11.868544600938968,
975
+ "grad_norm": 0.9427916407585144,
976
+ "learning_rate": 1.187997324637174e-05,
977
+ "loss": 3.4837,
978
+ "step": 32864
979
+ },
980
+ {
981
+ "epoch": 12.001444564824846,
982
+ "eval_g2l_cer": 48.4635,
983
+ "eval_g2l_gen_len": 3.0374,
984
+ "eval_g2l_rouge1": 41.42,
985
+ "eval_g2l_rouge2": 33.7871,
986
+ "eval_g2l_rougeL": 41.41,
987
+ "eval_g2l_rougeLsum": 41.3653,
988
+ "eval_l2ex_cer": 84.6873,
989
+ "eval_l2ex_gen_len": 21.5406,
990
+ "eval_l2ex_rouge1": 28.7533,
991
+ "eval_l2ex_rouge2": 12.7721,
992
+ "eval_l2ex_rougeL": 25.3715,
993
+ "eval_l2ex_rougeLsum": 25.3817,
994
+ "eval_l2g_cer": 71.4847,
995
+ "eval_l2g_gen_len": 15.5437,
996
+ "eval_l2g_rouge1": 39.2147,
997
+ "eval_l2g_rouge2": 26.5099,
998
+ "eval_l2g_rougeL": 37.2362,
999
+ "eval_l2g_rougeLsum": 37.2641,
1000
+ "eval_loss": 3.5653076171875,
1001
+ "eval_runtime": 189.8727,
1002
+ "eval_samples_per_second": 52.251,
1003
+ "eval_steps_per_second": 1.638,
1004
+ "step": 33232
1005
+ },
1006
+ {
1007
+ "epoch": 12.018779342723004,
1008
+ "grad_norm": 0.8259687423706055,
1009
+ "learning_rate": 1.0852225511383663e-05,
1010
+ "loss": 3.4764,
1011
+ "step": 33280
1012
+ },
1013
+ {
1014
+ "epoch": 12.169014084507042,
1015
+ "grad_norm": 0.904097855091095,
1016
+ "learning_rate": 9.86770509714574e-06,
1017
+ "loss": 3.4791,
1018
+ "step": 33696
1019
+ },
1020
+ {
1021
+ "epoch": 12.31924882629108,
1022
+ "grad_norm": 0.9662612080574036,
1023
+ "learning_rate": 8.927492221520133e-06,
1024
+ "loss": 3.4593,
1025
+ "step": 34112
1026
+ },
1027
+ {
1028
+ "epoch": 12.469483568075118,
1029
+ "grad_norm": 0.9324942231178284,
1030
+ "learning_rate": 8.032618488044715e-06,
1031
+ "loss": 3.4564,
1032
+ "step": 34528
1033
+ },
1034
+ {
1035
+ "epoch": 12.619718309859154,
1036
+ "grad_norm": 0.9966897964477539,
1037
+ "learning_rate": 7.184065754055608e-06,
1038
+ "loss": 3.4576,
1039
+ "step": 34944
1040
+ },
1041
+ {
1042
+ "epoch": 12.751534850126399,
1043
+ "eval_g2l_cer": 47.8718,
1044
+ "eval_g2l_gen_len": 3.0243,
1045
+ "eval_g2l_rouge1": 41.399,
1046
+ "eval_g2l_rouge2": 33.8189,
1047
+ "eval_g2l_rougeL": 41.4105,
1048
+ "eval_g2l_rougeLsum": 41.3515,
1049
+ "eval_l2ex_cer": 84.0524,
1050
+ "eval_l2ex_gen_len": 21.0206,
1051
+ "eval_l2ex_rouge1": 28.7814,
1052
+ "eval_l2ex_rouge2": 12.7663,
1053
+ "eval_l2ex_rougeL": 25.3724,
1054
+ "eval_l2ex_rougeLsum": 25.3895,
1055
+ "eval_l2g_cer": 71.6622,
1056
+ "eval_l2g_gen_len": 15.563,
1057
+ "eval_l2g_rouge1": 39.1666,
1058
+ "eval_l2g_rouge2": 26.5275,
1059
+ "eval_l2g_rougeL": 37.1881,
1060
+ "eval_l2g_rougeLsum": 37.2249,
1061
+ "eval_loss": 3.564103841781616,
1062
+ "eval_runtime": 190.2806,
1063
+ "eval_samples_per_second": 52.139,
1064
+ "eval_steps_per_second": 1.634,
1065
+ "step": 35309
1066
+ },
1067
+ {
1068
+ "epoch": 12.769953051643192,
1069
+ "grad_norm": 1.0099953413009644,
1070
+ "learning_rate": 6.382765053391182e-06,
1071
+ "loss": 3.4757,
1072
+ "step": 35360
1073
+ },
1074
+ {
1075
+ "epoch": 12.92018779342723,
1076
+ "grad_norm": 0.8347458243370056,
1077
+ "learning_rate": 5.629595574859816e-06,
1078
+ "loss": 3.4814,
1079
+ "step": 35776
1080
+ },
1081
+ {
1082
+ "epoch": 13.070422535211268,
1083
+ "grad_norm": 0.8532468676567078,
1084
+ "learning_rate": 4.925383697592043e-06,
1085
+ "loss": 3.4667,
1086
+ "step": 36192
1087
+ },
1088
+ {
1089
+ "epoch": 13.220657276995306,
1090
+ "grad_norm": 0.8852038383483887,
1091
+ "learning_rate": 4.2709020843357075e-06,
1092
+ "loss": 3.4512,
1093
+ "step": 36608
1094
+ },
1095
+ {
1096
+ "epoch": 13.370892018779342,
1097
+ "grad_norm": 1.058424472808838,
1098
+ "learning_rate": 3.666868833688726e-06,
1099
+ "loss": 3.4616,
1100
+ "step": 37024
1101
+ },
1102
+ {
1103
+ "epoch": 13.501625135427952,
1104
+ "eval_g2l_cer": 47.8581,
1105
+ "eval_g2l_gen_len": 3.0221,
1106
+ "eval_g2l_rouge1": 41.4693,
1107
+ "eval_g2l_rouge2": 33.7773,
1108
+ "eval_g2l_rougeL": 41.4822,
1109
+ "eval_g2l_rougeLsum": 41.4356,
1110
+ "eval_l2ex_cer": 84.3083,
1111
+ "eval_l2ex_gen_len": 21.0319,
1112
+ "eval_l2ex_rouge1": 28.654,
1113
+ "eval_l2ex_rouge2": 12.8413,
1114
+ "eval_l2ex_rougeL": 25.3941,
1115
+ "eval_l2ex_rougeLsum": 25.4326,
1116
+ "eval_l2g_cer": 71.0018,
1117
+ "eval_l2g_gen_len": 15.3407,
1118
+ "eval_l2g_rouge1": 39.2009,
1119
+ "eval_l2g_rouge2": 26.5422,
1120
+ "eval_l2g_rougeL": 37.2433,
1121
+ "eval_l2g_rougeLsum": 37.2693,
1122
+ "eval_loss": 3.5637874603271484,
1123
+ "eval_runtime": 187.7571,
1124
+ "eval_samples_per_second": 52.84,
1125
+ "eval_steps_per_second": 1.656,
1126
+ "step": 37386
1127
+ }
1128
+ ],
1129
+ "logging_steps": 416,
1130
+ "max_steps": 41535,
1131
+ "num_input_tokens_seen": 0,
1132
+ "num_train_epochs": 15,
1133
+ "save_steps": 2077,
1134
+ "total_flos": 7.17240637379838e+17,
1135
+ "train_batch_size": 16,
1136
+ "trial_name": null,
1137
+ "trial_params": null
1138
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d7cd326d002fd124e7b2f63ea7ced82382a90306e419db899b4d3a0d83186bf
3
+ size 5176