samahadhoud commited on
Commit
3e314f1
·
verified ·
1 Parent(s): 2653dd7

Add step-10000 checkpoint

Browse files
config.json ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "PIXELSumModel"
4
+ ],
5
+ "decoder": {
6
+ "_name_or_path": "gpt2",
7
+ "activation_function": "gelu_new",
8
+ "add_cross_attention": true,
9
+ "architectures": [
10
+ "GPT2LMHeadModel"
11
+ ],
12
+ "attn_pdrop": 0.1,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 50256,
15
+ "chunk_size_feed_forward": 0,
16
+ "cross_attention_hidden_size": null,
17
+ "cross_attention_reduce_factor": 1,
18
+ "decoder_start_token_id": null,
19
+ "diversity_penalty": 0.0,
20
+ "do_sample": false,
21
+ "early_stopping": false,
22
+ "embd_pdrop": 0.1,
23
+ "encoder_hidden_size": 768,
24
+ "encoder_no_repeat_ngram_size": 0,
25
+ "eos_token_id": 50256,
26
+ "exponential_decay_length_penalty": null,
27
+ "finetuning_task": null,
28
+ "forced_bos_token_id": null,
29
+ "forced_eos_token_id": null,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "is_decoder": true,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "layer_norm_epsilon": 1e-05,
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "min_length": 0,
45
+ "model_type": "gpt2",
46
+ "n_ctx": 1024,
47
+ "n_embd": 768,
48
+ "n_head": 12,
49
+ "n_inner": null,
50
+ "n_layer": 12,
51
+ "n_positions": 1024,
52
+ "no_repeat_ngram_size": 0,
53
+ "num_beam_groups": 1,
54
+ "num_beams": 1,
55
+ "num_return_sequences": 1,
56
+ "output_attentions": false,
57
+ "output_hidden_states": false,
58
+ "output_scores": false,
59
+ "pad_token_id": null,
60
+ "prefix": null,
61
+ "problem_type": null,
62
+ "pruned_heads": {},
63
+ "remove_invalid_values": false,
64
+ "reorder_and_upcast_attn": false,
65
+ "repetition_penalty": 1.0,
66
+ "resid_pdrop": 0.1,
67
+ "return_dict": true,
68
+ "return_dict_in_generate": false,
69
+ "scale_attn_by_inverse_layer_idx": false,
70
+ "scale_attn_weights": true,
71
+ "sep_token_id": null,
72
+ "summary_activation": null,
73
+ "summary_first_dropout": 0.1,
74
+ "summary_proj_to_labels": true,
75
+ "summary_type": "cls_index",
76
+ "summary_use_proj": true,
77
+ "task_specific_params": {
78
+ "text-generation": {
79
+ "do_sample": true,
80
+ "max_length": 64
81
+ }
82
+ },
83
+ "temperature": 1.0,
84
+ "tie_encoder_decoder": false,
85
+ "tie_word_embeddings": true,
86
+ "tokenizer_class": null,
87
+ "top_k": 50,
88
+ "top_p": 1.0,
89
+ "torch_dtype": null,
90
+ "torchscript": false,
91
+ "transformers_version": "4.19.0",
92
+ "typical_p": 1.0,
93
+ "use_bfloat16": false,
94
+ "use_cache": true,
95
+ "vocab_size": 50257
96
+ },
97
+ "decoder_start_token_id": 50256,
98
+ "encoder": {
99
+ "_name_or_path": "/workspace/PixelSum/experiments/pixelsum_m4",
100
+ "add_cross_attention": false,
101
+ "architectures": [
102
+ "PIXELForPreTraining"
103
+ ],
104
+ "attention_probs_dropout_prob": 0.1,
105
+ "bad_words_ids": null,
106
+ "bos_token_id": null,
107
+ "chunk_size_feed_forward": 0,
108
+ "cross_attention_hidden_size": null,
109
+ "decoder_hidden_size": 512,
110
+ "decoder_intermediate_size": 2048,
111
+ "decoder_num_attention_heads": 16,
112
+ "decoder_num_hidden_layers": 8,
113
+ "decoder_start_token_id": null,
114
+ "diversity_penalty": 0.0,
115
+ "do_eval": true,
116
+ "do_sample": false,
117
+ "early_stopping": false,
118
+ "encoder_no_repeat_ngram_size": 0,
119
+ "eos_token_id": null,
120
+ "exponential_decay_length_penalty": null,
121
+ "finetuning_task": null,
122
+ "forced_bos_token_id": null,
123
+ "forced_eos_token_id": null,
124
+ "hidden_act": "gelu",
125
+ "hidden_dropout_prob": 0.1,
126
+ "hidden_size": 768,
127
+ "id2label": {
128
+ "0": "LABEL_0",
129
+ "1": "LABEL_1"
130
+ },
131
+ "image_size": [
132
+ 16,
133
+ 8464
134
+ ],
135
+ "initializer_range": 0.02,
136
+ "intermediate_size": 3072,
137
+ "is_decoder": false,
138
+ "is_encoder_decoder": false,
139
+ "label2id": {
140
+ "LABEL_0": 0,
141
+ "LABEL_1": 1
142
+ },
143
+ "layer_norm_eps": 1e-12,
144
+ "length_penalty": 1.0,
145
+ "mask_ratio": 0.25,
146
+ "max_length": 20,
147
+ "min_length": 0,
148
+ "model_type": "pixel",
149
+ "no_repeat_ngram_size": 0,
150
+ "norm_pix_loss": true,
151
+ "num_attention_heads": 12,
152
+ "num_beam_groups": 1,
153
+ "num_beams": 1,
154
+ "num_channels": 3,
155
+ "num_hidden_layers": 12,
156
+ "num_return_sequences": 1,
157
+ "output_attentions": false,
158
+ "output_hidden_states": false,
159
+ "output_scores": false,
160
+ "pad_token_id": null,
161
+ "patch_size": 16,
162
+ "prefix": null,
163
+ "problem_type": null,
164
+ "pruned_heads": {},
165
+ "qkv_bias": true,
166
+ "remove_invalid_values": false,
167
+ "repetition_penalty": 1.0,
168
+ "return_dict": true,
169
+ "return_dict_in_generate": false,
170
+ "sep_token_id": null,
171
+ "task_specific_params": null,
172
+ "temperature": 1.0,
173
+ "tie_encoder_decoder": false,
174
+ "tie_word_embeddings": true,
175
+ "tokenizer_class": null,
176
+ "top_k": 50,
177
+ "top_p": 1.0,
178
+ "torch_dtype": "float32",
179
+ "torchscript": false,
180
+ "transformers_version": "4.19.0",
181
+ "typical_p": 1.0,
182
+ "use_bfloat16": false
183
+ },
184
+ "eos_token_id": 50256,
185
+ "is_encoder_decoder": true,
186
+ "model_type": "pixelsum",
187
+ "pad_token_id": 50256,
188
+ "tie_word_embeddings": false,
189
+ "torch_dtype": "float32",
190
+ "transformers_version": null,
191
+ "vocab_size": 50257
192
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f74a4955aab51f51f5f35575cf3426424d6e21d5a6765f9cec291034281ee2a
3
+ size 980777906
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
text_renderer_config.json ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "background_color": "white",
3
+ "dpi": 120,
4
+ "font_color": "black",
5
+ "font_file": "49e6dc219d1a1a1c9236acaf05a48b542002016a6dc877ee72baab085a84257b.3f28e7f4b38e1efe1b6da4a3732404c19d4c6a614ff32dce90a251e293d4ce58",
6
+ "font_size": 8,
7
+ "fonts_list": [
8
+ "Apple Color Emoji",
9
+ "DejaVu Sans",
10
+ "DejaVu Sans Mono",
11
+ "DejaVu Serif",
12
+ "Go Noto Current",
13
+ "Inconsolata",
14
+ "Monospace",
15
+ "Noto Fangsong KSS Rotated",
16
+ "Noto Fangsong KSS Vertical",
17
+ "Noto Kufi Arabic",
18
+ "Noto Music",
19
+ "Noto Naskh Arabic",
20
+ "Noto Naskh Arabic UI",
21
+ "Noto Nastaliq Urdu",
22
+ "Noto Rashi Hebrew",
23
+ "Noto Sans",
24
+ "Noto Sans Adlam",
25
+ "Noto Sans Adlam Unjoined",
26
+ "Noto Sans Anatolian Hieroglyphs",
27
+ "Noto Sans Arabic",
28
+ "Noto Sans Arabic UI",
29
+ "Noto Sans Armenian",
30
+ "Noto Sans Avestan",
31
+ "Noto Sans Balinese",
32
+ "Noto Sans Bamum",
33
+ "Noto Sans Bassa Vah",
34
+ "Noto Sans Batak",
35
+ "Noto Sans Bhaiksuki",
36
+ "Noto Sans Brahmi",
37
+ "Noto Sans Buginese",
38
+ "Noto Sans Buhid",
39
+ "Noto Sans CJK HK",
40
+ "Noto Sans CJK JP",
41
+ "Noto Sans CJK KR",
42
+ "Noto Sans CJK SC",
43
+ "Noto Sans CJK TC",
44
+ "Noto Sans Canadian Aboriginal",
45
+ "Noto Sans Carian",
46
+ "Noto Sans Caucasian Albanian",
47
+ "Noto Sans Chakma",
48
+ "Noto Sans Cham",
49
+ "Noto Sans Cherokee",
50
+ "Noto Sans Chorasmian",
51
+ "Noto Sans Coptic",
52
+ "Noto Sans Cuneiform",
53
+ "Noto Sans Cypriot",
54
+ "Noto Sans Cypro Minoan",
55
+ "Noto Sans Deseret",
56
+ "Noto Sans Devanagari",
57
+ "Noto Sans Devanagari UI",
58
+ "Noto Sans Duployan",
59
+ "Noto Sans Egyptian Hieroglyphs",
60
+ "Noto Sans Elbasan",
61
+ "Noto Sans Elymaic",
62
+ "Noto Sans Ethiopic",
63
+ "Noto Sans Georgian",
64
+ "Noto Sans Glagolitic",
65
+ "Noto Sans Gothic",
66
+ "Noto Sans Grantha",
67
+ "Noto Sans Gujarati",
68
+ "Noto Sans Gujarati UI",
69
+ "Noto Sans Gunjala Gondi",
70
+ "Noto Sans Gurmukhi",
71
+ "Noto Sans Gurmukhi UI",
72
+ "Noto Sans Hanifi Rohingya",
73
+ "Noto Sans Hanunoo",
74
+ "Noto Sans Hatran",
75
+ "Noto Sans Hebrew",
76
+ "Noto Sans Hebrew Droid",
77
+ "Noto Sans Imperial Aramaic",
78
+ "Noto Sans Indic Siyaq Numbers",
79
+ "Noto Sans Inscriptional Pahlavi",
80
+ "Noto Sans Inscriptional Parthian",
81
+ "Noto Sans Javanese",
82
+ "Noto Sans Kaithi",
83
+ "Noto Sans Kannada",
84
+ "Noto Sans Kannada UI",
85
+ "Noto Sans Kawi",
86
+ "Noto Sans Kayah Li",
87
+ "Noto Sans Kharoshthi",
88
+ "Noto Sans Khmer",
89
+ "Noto Sans Khmer UI",
90
+ "Noto Sans Khudawadi",
91
+ "Noto Sans Lao",
92
+ "Noto Sans Lao Looped",
93
+ "Noto Sans Lao Looped UI",
94
+ "Noto Sans Lao UI",
95
+ "Noto Sans Lepcha",
96
+ "Noto Sans Limbu",
97
+ "Noto Sans Linear A",
98
+ "Noto Sans Linear B",
99
+ "Noto Sans Lisu",
100
+ "Noto Sans Lycian",
101
+ "Noto Sans Lydian",
102
+ "Noto Sans Mahajani",
103
+ "Noto Sans Malayalam",
104
+ "Noto Sans Malayalam UI",
105
+ "Noto Sans Mandaic",
106
+ "Noto Sans Manichaean",
107
+ "Noto Sans Marchen",
108
+ "Noto Sans Masaram Gondi",
109
+ "Noto Sans Math",
110
+ "Noto Sans Mayan Numerals",
111
+ "Noto Sans Medefaidrin",
112
+ "Noto Sans Meetei Mayek",
113
+ "Noto Sans Mende Kikakui",
114
+ "Noto Sans Meroitic",
115
+ "Noto Sans Miao",
116
+ "Noto Sans Modi",
117
+ "Noto Sans Mongolian",
118
+ "Noto Sans Mono",
119
+ "Noto Sans Mro",
120
+ "Noto Sans Multani",
121
+ "Noto Sans Myanmar",
122
+ "Noto Sans Myanmar UI",
123
+ "Noto Sans NKo",
124
+ "Noto Sans NKo Unjoined",
125
+ "Noto Sans Nabataean",
126
+ "Noto Sans Nag Mundari",
127
+ "Noto Sans Nandinagari",
128
+ "Noto Sans New Tai Lue",
129
+ "Noto Sans Newa",
130
+ "Noto Sans Nushu",
131
+ "Noto Sans Ogham",
132
+ "Noto Sans Ol Chiki",
133
+ "Noto Sans Old Hungarian",
134
+ "Noto Sans Old Italic",
135
+ "Noto Sans Old North Arabian",
136
+ "Noto Sans Old Permic",
137
+ "Noto Sans Old Persian",
138
+ "Noto Sans Old Sogdian",
139
+ "Noto Sans Old South Arabian",
140
+ "Noto Sans Old Turkic",
141
+ "Noto Sans Oriya",
142
+ "Noto Sans Osage",
143
+ "Noto Sans Osmanya",
144
+ "Noto Sans Pahawh Hmong",
145
+ "Noto Sans Palmyrene",
146
+ "Noto Sans Pau Cin Hau",
147
+ "Noto Sans PhagsPa",
148
+ "Noto Sans Phoenician",
149
+ "Noto Sans Psalter Pahlavi",
150
+ "Noto Sans Rejang",
151
+ "Noto Sans Runic",
152
+ "Noto Sans Samaritan",
153
+ "Noto Sans Saurashtra",
154
+ "Noto Sans Sharada",
155
+ "Noto Sans Shavian",
156
+ "Noto Sans Siddham",
157
+ "Noto Sans SignWriting",
158
+ "Noto Sans Sinhala",
159
+ "Noto Sans Sinhala UI",
160
+ "Noto Sans Sogdian",
161
+ "Noto Sans Sora Sompeng",
162
+ "Noto Sans Soyombo",
163
+ "Noto Sans Sundanese",
164
+ "Noto Sans Sunuwar",
165
+ "Noto Sans Syloti Nagri",
166
+ "Noto Sans Symbols",
167
+ "Noto Sans Symbols 2",
168
+ "Noto Sans Syriac",
169
+ "Noto Sans Syriac Eastern",
170
+ "Noto Sans Syriac Western",
171
+ "Noto Sans Tagalog",
172
+ "Noto Sans Tagbanwa",
173
+ "Noto Sans Tai Le",
174
+ "Noto Sans Tai Tham",
175
+ "Noto Sans Tai Viet",
176
+ "Noto Sans Takri",
177
+ "Noto Sans Tamil",
178
+ "Noto Sans Tamil Supplement",
179
+ "Noto Sans Tamil UI",
180
+ "Noto Sans Tangsa",
181
+ "Noto Sans Telugu",
182
+ "Noto Sans Telugu UI",
183
+ "Noto Sans Test",
184
+ "Noto Sans Thaana",
185
+ "Noto Sans Thai",
186
+ "Noto Sans Thai Looped",
187
+ "Noto Sans Thai Looped UI",
188
+ "Noto Sans Thai UI",
189
+ "Noto Sans Tifinagh",
190
+ "Noto Sans Tifinagh APT",
191
+ "Noto Sans Tifinagh Adrar",
192
+ "Noto Sans Tifinagh Agraw Imazighen",
193
+ "Noto Sans Tifinagh Ahaggar",
194
+ "Noto Sans Tifinagh Air",
195
+ "Noto Sans Tifinagh Azawagh",
196
+ "Noto Sans Tifinagh Ghat",
197
+ "Noto Sans Tifinagh Hawad",
198
+ "Noto Sans Tifinagh Rhissa Ixa",
199
+ "Noto Sans Tifinagh SIL",
200
+ "Noto Sans Tifinagh Tawellemmet",
201
+ "Noto Sans Tirhuta",
202
+ "Noto Sans Ugaritic",
203
+ "Noto Sans Vai",
204
+ "Noto Sans Vithkuqi",
205
+ "Noto Sans Wancho",
206
+ "Noto Sans Warang Citi",
207
+ "Noto Sans Yi",
208
+ "Noto Sans Zanabazar Square",
209
+ "Noto Traditional Nushu",
210
+ "Noto Znamenny Musical Notation",
211
+ "NotoSansOldHungarianUI",
212
+ "Sans",
213
+ "Serif",
214
+ "Source Code Pro",
215
+ "System-ui",
216
+ "Ubuntu",
217
+ "Ubuntu Condensed",
218
+ "Ubuntu Mono"
219
+ ],
220
+ "max_seq_length": 529,
221
+ "pad_size": 3,
222
+ "pixels_per_patch": 16,
223
+ "rgb": false,
224
+ "text_renderer_type": "PangoCairoTextRenderer"
225
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
trainer_state.json ADDED
@@ -0,0 +1,1672 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.8625396953625901,
3
+ "best_model_checkpoint": "experiments/translation/gpt2/2025-06-23_21-13-47/checkpoint-12000",
4
+ "epoch": 9.69263370332997,
5
+ "global_step": 12000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.04,
12
+ "learning_rate": 4.8e-06,
13
+ "loss": 5.5374,
14
+ "step": 50
15
+ },
16
+ {
17
+ "epoch": 0.08,
18
+ "learning_rate": 9.799999999999998e-06,
19
+ "loss": 4.9125,
20
+ "step": 100
21
+ },
22
+ {
23
+ "epoch": 0.12,
24
+ "learning_rate": 1.4799999999999999e-05,
25
+ "loss": 4.7963,
26
+ "step": 150
27
+ },
28
+ {
29
+ "epoch": 0.16,
30
+ "learning_rate": 1.98e-05,
31
+ "loss": 4.7123,
32
+ "step": 200
33
+ },
34
+ {
35
+ "epoch": 0.2,
36
+ "learning_rate": 2.4799999999999996e-05,
37
+ "loss": 4.6695,
38
+ "step": 250
39
+ },
40
+ {
41
+ "epoch": 0.24,
42
+ "learning_rate": 2.9799999999999996e-05,
43
+ "loss": 4.6409,
44
+ "step": 300
45
+ },
46
+ {
47
+ "epoch": 0.28,
48
+ "learning_rate": 3.48e-05,
49
+ "loss": 4.5901,
50
+ "step": 350
51
+ },
52
+ {
53
+ "epoch": 0.32,
54
+ "learning_rate": 3.979999999999999e-05,
55
+ "loss": 4.5638,
56
+ "step": 400
57
+ },
58
+ {
59
+ "epoch": 0.36,
60
+ "learning_rate": 4.48e-05,
61
+ "loss": 4.5254,
62
+ "step": 450
63
+ },
64
+ {
65
+ "epoch": 0.4,
66
+ "learning_rate": 4.98e-05,
67
+ "loss": 4.4895,
68
+ "step": 500
69
+ },
70
+ {
71
+ "epoch": 0.4,
72
+ "eval_bleu": 0.10366108848804567,
73
+ "eval_loss": 4.428106784820557,
74
+ "eval_runtime": 1956.2148,
75
+ "eval_samples_per_second": 4.503,
76
+ "eval_steps_per_second": 0.282,
77
+ "step": 500
78
+ },
79
+ {
80
+ "epoch": 0.44,
81
+ "learning_rate": 5.48e-05,
82
+ "loss": 4.5073,
83
+ "step": 550
84
+ },
85
+ {
86
+ "epoch": 0.48,
87
+ "learning_rate": 5.98e-05,
88
+ "loss": 4.4717,
89
+ "step": 600
90
+ },
91
+ {
92
+ "epoch": 0.52,
93
+ "learning_rate": 6.479999999999999e-05,
94
+ "loss": 4.4849,
95
+ "step": 650
96
+ },
97
+ {
98
+ "epoch": 0.57,
99
+ "learning_rate": 6.979999999999999e-05,
100
+ "loss": 4.4378,
101
+ "step": 700
102
+ },
103
+ {
104
+ "epoch": 0.61,
105
+ "learning_rate": 7.479999999999999e-05,
106
+ "loss": 4.4473,
107
+ "step": 750
108
+ },
109
+ {
110
+ "epoch": 0.65,
111
+ "learning_rate": 7.98e-05,
112
+ "loss": 4.4403,
113
+ "step": 800
114
+ },
115
+ {
116
+ "epoch": 0.69,
117
+ "learning_rate": 8.48e-05,
118
+ "loss": 4.4189,
119
+ "step": 850
120
+ },
121
+ {
122
+ "epoch": 0.73,
123
+ "learning_rate": 8.98e-05,
124
+ "loss": 4.3842,
125
+ "step": 900
126
+ },
127
+ {
128
+ "epoch": 0.77,
129
+ "learning_rate": 9.479999999999999e-05,
130
+ "loss": 4.3661,
131
+ "step": 950
132
+ },
133
+ {
134
+ "epoch": 0.81,
135
+ "learning_rate": 9.979999999999999e-05,
136
+ "loss": 4.3978,
137
+ "step": 1000
138
+ },
139
+ {
140
+ "epoch": 0.81,
141
+ "eval_bleu": 0.12101650019262844,
142
+ "eval_loss": 4.30416202545166,
143
+ "eval_runtime": 1952.2431,
144
+ "eval_samples_per_second": 4.512,
145
+ "eval_steps_per_second": 0.282,
146
+ "step": 1000
147
+ },
148
+ {
149
+ "epoch": 0.85,
150
+ "learning_rate": 0.00010479999999999999,
151
+ "loss": 4.353,
152
+ "step": 1050
153
+ },
154
+ {
155
+ "epoch": 0.89,
156
+ "learning_rate": 0.00010979999999999999,
157
+ "loss": 4.3321,
158
+ "step": 1100
159
+ },
160
+ {
161
+ "epoch": 0.93,
162
+ "learning_rate": 0.00011479999999999999,
163
+ "loss": 4.3258,
164
+ "step": 1150
165
+ },
166
+ {
167
+ "epoch": 0.97,
168
+ "learning_rate": 0.00011979999999999998,
169
+ "loss": 4.3221,
170
+ "step": 1200
171
+ },
172
+ {
173
+ "epoch": 1.01,
174
+ "learning_rate": 0.00012479999999999997,
175
+ "loss": 4.3282,
176
+ "step": 1250
177
+ },
178
+ {
179
+ "epoch": 1.05,
180
+ "learning_rate": 0.00012979999999999998,
181
+ "loss": 4.1992,
182
+ "step": 1300
183
+ },
184
+ {
185
+ "epoch": 1.09,
186
+ "learning_rate": 0.00013479999999999997,
187
+ "loss": 4.1949,
188
+ "step": 1350
189
+ },
190
+ {
191
+ "epoch": 1.13,
192
+ "learning_rate": 0.00013979999999999998,
193
+ "loss": 4.1812,
194
+ "step": 1400
195
+ },
196
+ {
197
+ "epoch": 1.17,
198
+ "learning_rate": 0.0001448,
199
+ "loss": 4.1842,
200
+ "step": 1450
201
+ },
202
+ {
203
+ "epoch": 1.21,
204
+ "learning_rate": 0.00014979999999999998,
205
+ "loss": 4.1744,
206
+ "step": 1500
207
+ },
208
+ {
209
+ "epoch": 1.21,
210
+ "eval_bleu": 0.19483428061504493,
211
+ "eval_loss": 4.20745849609375,
212
+ "eval_runtime": 1957.9666,
213
+ "eval_samples_per_second": 4.499,
214
+ "eval_steps_per_second": 0.281,
215
+ "step": 1500
216
+ },
217
+ {
218
+ "epoch": 1.25,
219
+ "learning_rate": 0.00014999895016087306,
220
+ "loss": 4.1746,
221
+ "step": 1550
222
+ },
223
+ {
224
+ "epoch": 1.29,
225
+ "learning_rate": 0.00014999562388001478,
226
+ "loss": 4.134,
227
+ "step": 1600
228
+ },
229
+ {
230
+ "epoch": 1.33,
231
+ "learning_rate": 0.00014999001943582685,
232
+ "loss": 4.1534,
233
+ "step": 1650
234
+ },
235
+ {
236
+ "epoch": 1.37,
237
+ "learning_rate": 0.0001499821369985571,
238
+ "loss": 4.1424,
239
+ "step": 1700
240
+ },
241
+ {
242
+ "epoch": 1.41,
243
+ "learning_rate": 0.0001499719768076526,
244
+ "loss": 4.1222,
245
+ "step": 1750
246
+ },
247
+ {
248
+ "epoch": 1.45,
249
+ "learning_rate": 0.00014995953917175227,
250
+ "loss": 4.1216,
251
+ "step": 1800
252
+ },
253
+ {
254
+ "epoch": 1.49,
255
+ "learning_rate": 0.00014994482446867774,
256
+ "loss": 4.0985,
257
+ "step": 1850
258
+ },
259
+ {
260
+ "epoch": 1.53,
261
+ "learning_rate": 0.00014992783314542174,
262
+ "loss": 4.0785,
263
+ "step": 1900
264
+ },
265
+ {
266
+ "epoch": 1.57,
267
+ "learning_rate": 0.00014990856571813448,
268
+ "loss": 4.1087,
269
+ "step": 1950
270
+ },
271
+ {
272
+ "epoch": 1.62,
273
+ "learning_rate": 0.0001498870227721081,
274
+ "loss": 4.0548,
275
+ "step": 2000
276
+ },
277
+ {
278
+ "epoch": 1.62,
279
+ "eval_bleu": 0.3099045639043874,
280
+ "eval_loss": 4.0605082511901855,
281
+ "eval_runtime": 1963.1271,
282
+ "eval_samples_per_second": 4.487,
283
+ "eval_steps_per_second": 0.281,
284
+ "step": 2000
285
+ },
286
+ {
287
+ "epoch": 1.66,
288
+ "learning_rate": 0.0001498632049617587,
289
+ "loss": 4.0445,
290
+ "step": 2050
291
+ },
292
+ {
293
+ "epoch": 1.7,
294
+ "learning_rate": 0.00014983711301060673,
295
+ "loss": 4.0045,
296
+ "step": 2100
297
+ },
298
+ {
299
+ "epoch": 1.74,
300
+ "learning_rate": 0.00014980874771125478,
301
+ "loss": 4.0004,
302
+ "step": 2150
303
+ },
304
+ {
305
+ "epoch": 1.78,
306
+ "learning_rate": 0.00014977810992536359,
307
+ "loss": 4.0131,
308
+ "step": 2200
309
+ },
310
+ {
311
+ "epoch": 1.82,
312
+ "learning_rate": 0.00014974520058362584,
313
+ "loss": 4.0018,
314
+ "step": 2250
315
+ },
316
+ {
317
+ "epoch": 1.86,
318
+ "learning_rate": 0.00014971002068573793,
319
+ "loss": 3.9797,
320
+ "step": 2300
321
+ },
322
+ {
323
+ "epoch": 1.9,
324
+ "learning_rate": 0.00014967257130036961,
325
+ "loss": 3.9479,
326
+ "step": 2350
327
+ },
328
+ {
329
+ "epoch": 1.94,
330
+ "learning_rate": 0.00014963285356513152,
331
+ "loss": 3.9432,
332
+ "step": 2400
333
+ },
334
+ {
335
+ "epoch": 1.98,
336
+ "learning_rate": 0.0001495908686865405,
337
+ "loss": 3.9119,
338
+ "step": 2450
339
+ },
340
+ {
341
+ "epoch": 2.02,
342
+ "learning_rate": 0.00014954661793998317,
343
+ "loss": 3.8522,
344
+ "step": 2500
345
+ },
346
+ {
347
+ "epoch": 2.02,
348
+ "eval_bleu": 0.41817761915800616,
349
+ "eval_loss": 3.9211950302124023,
350
+ "eval_runtime": 1966.167,
351
+ "eval_samples_per_second": 4.48,
352
+ "eval_steps_per_second": 0.28,
353
+ "step": 2500
354
+ },
355
+ {
356
+ "epoch": 2.06,
357
+ "learning_rate": 0.0001495001026696769,
358
+ "loss": 3.6675,
359
+ "step": 2550
360
+ },
361
+ {
362
+ "epoch": 2.1,
363
+ "learning_rate": 0.00014945132428862936,
364
+ "loss": 3.6431,
365
+ "step": 2600
366
+ },
367
+ {
368
+ "epoch": 2.14,
369
+ "learning_rate": 0.00014940028427859524,
370
+ "loss": 3.6479,
371
+ "step": 2650
372
+ },
373
+ {
374
+ "epoch": 2.18,
375
+ "learning_rate": 0.00014934698419003133,
376
+ "loss": 3.6214,
377
+ "step": 2700
378
+ },
379
+ {
380
+ "epoch": 2.22,
381
+ "learning_rate": 0.0001492914256420496,
382
+ "loss": 3.6333,
383
+ "step": 2750
384
+ },
385
+ {
386
+ "epoch": 2.26,
387
+ "learning_rate": 0.00014923361032236776,
388
+ "loss": 3.6243,
389
+ "step": 2800
390
+ },
391
+ {
392
+ "epoch": 2.3,
393
+ "learning_rate": 0.00014917353998725823,
394
+ "loss": 3.6292,
395
+ "step": 2850
396
+ },
397
+ {
398
+ "epoch": 2.34,
399
+ "learning_rate": 0.00014911121646149456,
400
+ "loss": 3.5965,
401
+ "step": 2900
402
+ },
403
+ {
404
+ "epoch": 2.38,
405
+ "learning_rate": 0.00014904664163829616,
406
+ "loss": 3.5995,
407
+ "step": 2950
408
+ },
409
+ {
410
+ "epoch": 2.42,
411
+ "learning_rate": 0.00014897981747927076,
412
+ "loss": 3.5954,
413
+ "step": 3000
414
+ },
415
+ {
416
+ "epoch": 2.42,
417
+ "eval_bleu": 0.6197998214597401,
418
+ "eval_loss": 3.80145525932312,
419
+ "eval_runtime": 1964.8057,
420
+ "eval_samples_per_second": 4.483,
421
+ "eval_steps_per_second": 0.28,
422
+ "step": 3000
423
+ },
424
+ {
425
+ "epoch": 2.46,
426
+ "learning_rate": 0.00014891074601435482,
427
+ "loss": 3.5926,
428
+ "step": 3050
429
+ },
430
+ {
431
+ "epoch": 2.5,
432
+ "learning_rate": 0.00014883942934175177,
433
+ "loss": 3.5782,
434
+ "step": 3100
435
+ },
436
+ {
437
+ "epoch": 2.54,
438
+ "learning_rate": 0.0001487658696278684,
439
+ "loss": 3.5606,
440
+ "step": 3150
441
+ },
442
+ {
443
+ "epoch": 2.58,
444
+ "learning_rate": 0.000148690069107249,
445
+ "loss": 3.5623,
446
+ "step": 3200
447
+ },
448
+ {
449
+ "epoch": 2.62,
450
+ "learning_rate": 0.00014861203008250745,
451
+ "loss": 3.5484,
452
+ "step": 3250
453
+ },
454
+ {
455
+ "epoch": 2.67,
456
+ "learning_rate": 0.0001485317549242574,
457
+ "loss": 3.574,
458
+ "step": 3300
459
+ },
460
+ {
461
+ "epoch": 2.71,
462
+ "learning_rate": 0.00014844924607104,
463
+ "loss": 3.5422,
464
+ "step": 3350
465
+ },
466
+ {
467
+ "epoch": 2.75,
468
+ "learning_rate": 0.00014836450602925014,
469
+ "loss": 3.5185,
470
+ "step": 3400
471
+ },
472
+ {
473
+ "epoch": 2.79,
474
+ "learning_rate": 0.00014827753737306008,
475
+ "loss": 3.5329,
476
+ "step": 3450
477
+ },
478
+ {
479
+ "epoch": 2.83,
480
+ "learning_rate": 0.00014818834274434134,
481
+ "loss": 3.5305,
482
+ "step": 3500
483
+ },
484
+ {
485
+ "epoch": 2.83,
486
+ "eval_bleu": 0.9915510687453413,
487
+ "eval_loss": 3.6937255859375,
488
+ "eval_runtime": 1965.6982,
489
+ "eval_samples_per_second": 4.481,
490
+ "eval_steps_per_second": 0.28,
491
+ "step": 3500
492
+ },
493
+ {
494
+ "epoch": 2.87,
495
+ "learning_rate": 0.00014809692485258445,
496
+ "loss": 3.4934,
497
+ "step": 3550
498
+ },
499
+ {
500
+ "epoch": 2.91,
501
+ "learning_rate": 0.00014800328647481662,
502
+ "loss": 3.5064,
503
+ "step": 3600
504
+ },
505
+ {
506
+ "epoch": 2.95,
507
+ "learning_rate": 0.00014790743045551744,
508
+ "loss": 3.475,
509
+ "step": 3650
510
+ },
511
+ {
512
+ "epoch": 2.99,
513
+ "learning_rate": 0.00014780935970653235,
514
+ "loss": 3.4644,
515
+ "step": 3700
516
+ },
517
+ {
518
+ "epoch": 3.03,
519
+ "learning_rate": 0.00014770907720698426,
520
+ "loss": 3.3422,
521
+ "step": 3750
522
+ },
523
+ {
524
+ "epoch": 3.07,
525
+ "learning_rate": 0.00014760658600318318,
526
+ "loss": 3.2076,
527
+ "step": 3800
528
+ },
529
+ {
530
+ "epoch": 3.11,
531
+ "learning_rate": 0.00014750188920853338,
532
+ "loss": 3.2166,
533
+ "step": 3850
534
+ },
535
+ {
536
+ "epoch": 3.15,
537
+ "learning_rate": 0.00014739499000343914,
538
+ "loss": 3.214,
539
+ "step": 3900
540
+ },
541
+ {
542
+ "epoch": 3.19,
543
+ "learning_rate": 0.0001472858916352079,
544
+ "loss": 3.2104,
545
+ "step": 3950
546
+ },
547
+ {
548
+ "epoch": 3.23,
549
+ "learning_rate": 0.0001471745974179517,
550
+ "loss": 3.175,
551
+ "step": 4000
552
+ },
553
+ {
554
+ "epoch": 3.23,
555
+ "eval_bleu": 0.8229569245913304,
556
+ "eval_loss": 3.6438491344451904,
557
+ "eval_runtime": 1966.943,
558
+ "eval_samples_per_second": 4.478,
559
+ "eval_steps_per_second": 0.28,
560
+ "step": 4000
561
+ },
562
+ {
563
+ "epoch": 3.27,
564
+ "learning_rate": 0.00014706111073248656,
565
+ "loss": 3.2015,
566
+ "step": 4050
567
+ },
568
+ {
569
+ "epoch": 3.31,
570
+ "learning_rate": 0.0001469454350262297,
571
+ "loss": 3.2109,
572
+ "step": 4100
573
+ },
574
+ {
575
+ "epoch": 3.35,
576
+ "learning_rate": 0.0001468275738130948,
577
+ "loss": 3.2042,
578
+ "step": 4150
579
+ },
580
+ {
581
+ "epoch": 3.39,
582
+ "learning_rate": 0.0001467075306733854,
583
+ "loss": 3.1996,
584
+ "step": 4200
585
+ },
586
+ {
587
+ "epoch": 3.43,
588
+ "learning_rate": 0.000146585309253686,
589
+ "loss": 3.206,
590
+ "step": 4250
591
+ },
592
+ {
593
+ "epoch": 3.47,
594
+ "learning_rate": 0.00014646091326675126,
595
+ "loss": 3.1749,
596
+ "step": 4300
597
+ },
598
+ {
599
+ "epoch": 3.51,
600
+ "learning_rate": 0.00014633434649139344,
601
+ "loss": 3.2049,
602
+ "step": 4350
603
+ },
604
+ {
605
+ "epoch": 3.55,
606
+ "learning_rate": 0.00014620561277236722,
607
+ "loss": 3.1852,
608
+ "step": 4400
609
+ },
610
+ {
611
+ "epoch": 3.59,
612
+ "learning_rate": 0.0001460747160202534,
613
+ "loss": 3.1768,
614
+ "step": 4450
615
+ },
616
+ {
617
+ "epoch": 3.63,
618
+ "learning_rate": 0.0001459416602113397,
619
+ "loss": 3.1863,
620
+ "step": 4500
621
+ },
622
+ {
623
+ "epoch": 3.63,
624
+ "eval_bleu": 1.0657169333169068,
625
+ "eval_loss": 3.550638437271118,
626
+ "eval_runtime": 1967.2607,
627
+ "eval_samples_per_second": 4.477,
628
+ "eval_steps_per_second": 0.28,
629
+ "step": 4500
630
+ },
631
+ {
632
+ "epoch": 3.67,
633
+ "learning_rate": 0.00014580644938750012,
634
+ "loss": 3.1681,
635
+ "step": 4550
636
+ },
637
+ {
638
+ "epoch": 3.72,
639
+ "learning_rate": 0.00014566908765607222,
640
+ "loss": 3.1675,
641
+ "step": 4600
642
+ },
643
+ {
644
+ "epoch": 3.76,
645
+ "learning_rate": 0.00014552957918973226,
646
+ "loss": 3.1601,
647
+ "step": 4650
648
+ },
649
+ {
650
+ "epoch": 3.8,
651
+ "learning_rate": 0.00014538792822636849,
652
+ "loss": 3.154,
653
+ "step": 4700
654
+ },
655
+ {
656
+ "epoch": 3.84,
657
+ "learning_rate": 0.00014524413906895234,
658
+ "loss": 3.1548,
659
+ "step": 4750
660
+ },
661
+ {
662
+ "epoch": 3.88,
663
+ "learning_rate": 0.00014509821608540784,
664
+ "loss": 3.14,
665
+ "step": 4800
666
+ },
667
+ {
668
+ "epoch": 3.92,
669
+ "learning_rate": 0.00014495016370847882,
670
+ "loss": 3.1584,
671
+ "step": 4850
672
+ },
673
+ {
674
+ "epoch": 3.96,
675
+ "learning_rate": 0.00014479998643559435,
676
+ "loss": 3.1351,
677
+ "step": 4900
678
+ },
679
+ {
680
+ "epoch": 4.0,
681
+ "learning_rate": 0.00014464768882873198,
682
+ "loss": 3.1546,
683
+ "step": 4950
684
+ },
685
+ {
686
+ "epoch": 4.04,
687
+ "learning_rate": 0.00014449327551427935,
688
+ "loss": 2.9153,
689
+ "step": 5000
690
+ },
691
+ {
692
+ "epoch": 4.04,
693
+ "eval_bleu": 1.0047431556971733,
694
+ "eval_loss": 3.5359809398651123,
695
+ "eval_runtime": 1966.6362,
696
+ "eval_samples_per_second": 4.479,
697
+ "eval_steps_per_second": 0.28,
698
+ "step": 5000
699
+ },
700
+ {
701
+ "epoch": 4.08,
702
+ "learning_rate": 0.0001443367511828934,
703
+ "loss": 2.8766,
704
+ "step": 5050
705
+ },
706
+ {
707
+ "epoch": 4.12,
708
+ "learning_rate": 0.0001441781205893582,
709
+ "loss": 2.8748,
710
+ "step": 5100
711
+ },
712
+ {
713
+ "epoch": 4.16,
714
+ "learning_rate": 0.00014401738855244028,
715
+ "loss": 2.8439,
716
+ "step": 5150
717
+ },
718
+ {
719
+ "epoch": 4.2,
720
+ "learning_rate": 0.00014385455995474222,
721
+ "loss": 2.8731,
722
+ "step": 5200
723
+ },
724
+ {
725
+ "epoch": 4.24,
726
+ "learning_rate": 0.00014368963974255454,
727
+ "loss": 2.8727,
728
+ "step": 5250
729
+ },
730
+ {
731
+ "epoch": 4.28,
732
+ "learning_rate": 0.0001435226329257053,
733
+ "loss": 2.8911,
734
+ "step": 5300
735
+ },
736
+ {
737
+ "epoch": 4.32,
738
+ "learning_rate": 0.00014335354457740792,
739
+ "loss": 2.8639,
740
+ "step": 5350
741
+ },
742
+ {
743
+ "epoch": 4.36,
744
+ "learning_rate": 0.00014318237983410706,
745
+ "loss": 2.8785,
746
+ "step": 5400
747
+ },
748
+ {
749
+ "epoch": 4.4,
750
+ "learning_rate": 0.0001430091438953227,
751
+ "loss": 2.8755,
752
+ "step": 5450
753
+ },
754
+ {
755
+ "epoch": 4.44,
756
+ "learning_rate": 0.00014283384202349203,
757
+ "loss": 2.8661,
758
+ "step": 5500
759
+ },
760
+ {
761
+ "epoch": 4.44,
762
+ "eval_bleu": 1.207406468713491,
763
+ "eval_loss": 3.5010602474212646,
764
+ "eval_runtime": 1964.1467,
765
+ "eval_samples_per_second": 4.484,
766
+ "eval_steps_per_second": 0.281,
767
+ "step": 5500
768
+ },
769
+ {
770
+ "epoch": 4.48,
771
+ "learning_rate": 0.00014265647954380976,
772
+ "loss": 2.894,
773
+ "step": 5550
774
+ },
775
+ {
776
+ "epoch": 4.52,
777
+ "learning_rate": 0.00014247706184406618,
778
+ "loss": 2.8723,
779
+ "step": 5600
780
+ },
781
+ {
782
+ "epoch": 4.56,
783
+ "learning_rate": 0.00014229559437448362,
784
+ "loss": 2.868,
785
+ "step": 5650
786
+ },
787
+ {
788
+ "epoch": 4.6,
789
+ "learning_rate": 0.00014211208264755092,
790
+ "loss": 2.8675,
791
+ "step": 5700
792
+ },
793
+ {
794
+ "epoch": 4.64,
795
+ "learning_rate": 0.00014192653223785577,
796
+ "loss": 2.8653,
797
+ "step": 5750
798
+ },
799
+ {
800
+ "epoch": 4.68,
801
+ "learning_rate": 0.0001417389487819156,
802
+ "loss": 2.8709,
803
+ "step": 5800
804
+ },
805
+ {
806
+ "epoch": 4.72,
807
+ "learning_rate": 0.00014154933797800621,
808
+ "loss": 2.8535,
809
+ "step": 5850
810
+ },
811
+ {
812
+ "epoch": 4.77,
813
+ "learning_rate": 0.0001413577055859888,
814
+ "loss": 2.8765,
815
+ "step": 5900
816
+ },
817
+ {
818
+ "epoch": 4.81,
819
+ "learning_rate": 0.00014116405742713484,
820
+ "loss": 2.8777,
821
+ "step": 5950
822
+ },
823
+ {
824
+ "epoch": 4.85,
825
+ "learning_rate": 0.00014096839938394936,
826
+ "loss": 2.8441,
827
+ "step": 6000
828
+ },
829
+ {
830
+ "epoch": 4.85,
831
+ "eval_bleu": 1.2898999365765402,
832
+ "eval_loss": 3.4188201427459717,
833
+ "eval_runtime": 1959.6425,
834
+ "eval_samples_per_second": 4.495,
835
+ "eval_steps_per_second": 0.281,
836
+ "step": 6000
837
+ },
838
+ {
839
+ "epoch": 4.89,
840
+ "learning_rate": 0.00014077073739999222,
841
+ "loss": 2.8655,
842
+ "step": 6050
843
+ },
844
+ {
845
+ "epoch": 4.93,
846
+ "learning_rate": 0.0001405710774796975,
847
+ "loss": 2.8427,
848
+ "step": 6100
849
+ },
850
+ {
851
+ "epoch": 4.97,
852
+ "learning_rate": 0.0001403694256881913,
853
+ "loss": 2.8573,
854
+ "step": 6150
855
+ },
856
+ {
857
+ "epoch": 5.01,
858
+ "learning_rate": 0.00014016578815110716,
859
+ "loss": 2.8372,
860
+ "step": 6200
861
+ },
862
+ {
863
+ "epoch": 5.05,
864
+ "learning_rate": 0.00013996017105440036,
865
+ "loss": 2.5577,
866
+ "step": 6250
867
+ },
868
+ {
869
+ "epoch": 5.09,
870
+ "learning_rate": 0.00013975258064415972,
871
+ "loss": 2.5632,
872
+ "step": 6300
873
+ },
874
+ {
875
+ "epoch": 5.13,
876
+ "learning_rate": 0.00013954302322641797,
877
+ "loss": 2.5707,
878
+ "step": 6350
879
+ },
880
+ {
881
+ "epoch": 5.17,
882
+ "learning_rate": 0.00013933150516696024,
883
+ "loss": 2.5991,
884
+ "step": 6400
885
+ },
886
+ {
887
+ "epoch": 5.21,
888
+ "learning_rate": 0.00013911803289113055,
889
+ "loss": 2.5697,
890
+ "step": 6450
891
+ },
892
+ {
893
+ "epoch": 5.25,
894
+ "learning_rate": 0.00013890261288363676,
895
+ "loss": 2.5857,
896
+ "step": 6500
897
+ },
898
+ {
899
+ "epoch": 5.25,
900
+ "eval_bleu": 1.3087062882464005,
901
+ "eval_loss": 3.4442849159240723,
902
+ "eval_runtime": 1957.1145,
903
+ "eval_samples_per_second": 4.501,
904
+ "eval_steps_per_second": 0.282,
905
+ "step": 6500
906
+ },
907
+ {
908
+ "epoch": 5.29,
909
+ "learning_rate": 0.00013868525168835353,
910
+ "loss": 2.614,
911
+ "step": 6550
912
+ },
913
+ {
914
+ "epoch": 5.33,
915
+ "learning_rate": 0.0001384659559081235,
916
+ "loss": 2.6133,
917
+ "step": 6600
918
+ },
919
+ {
920
+ "epoch": 5.37,
921
+ "learning_rate": 0.0001382447322045568,
922
+ "loss": 2.5898,
923
+ "step": 6650
924
+ },
925
+ {
926
+ "epoch": 5.41,
927
+ "learning_rate": 0.0001380215872978285,
928
+ "loss": 2.5992,
929
+ "step": 6700
930
+ },
931
+ {
932
+ "epoch": 5.45,
933
+ "learning_rate": 0.0001377965279664748,
934
+ "loss": 2.5873,
935
+ "step": 6750
936
+ },
937
+ {
938
+ "epoch": 5.49,
939
+ "learning_rate": 0.0001375695610471868,
940
+ "loss": 2.6155,
941
+ "step": 6800
942
+ },
943
+ {
944
+ "epoch": 5.53,
945
+ "learning_rate": 0.00013734069343460293,
946
+ "loss": 2.6044,
947
+ "step": 6850
948
+ },
949
+ {
950
+ "epoch": 5.57,
951
+ "learning_rate": 0.0001371099320810995,
952
+ "loss": 2.6239,
953
+ "step": 6900
954
+ },
955
+ {
956
+ "epoch": 5.61,
957
+ "learning_rate": 0.0001368772839965797,
958
+ "loss": 2.6267,
959
+ "step": 6950
960
+ },
961
+ {
962
+ "epoch": 5.65,
963
+ "learning_rate": 0.00013664275624826025,
964
+ "loss": 2.6094,
965
+ "step": 7000
966
+ },
967
+ {
968
+ "epoch": 5.65,
969
+ "eval_bleu": 1.423644909665514,
970
+ "eval_loss": 3.3920862674713135,
971
+ "eval_runtime": 1956.3214,
972
+ "eval_samples_per_second": 4.502,
973
+ "eval_steps_per_second": 0.282,
974
+ "step": 7000
975
+ },
976
+ {
977
+ "epoch": 5.69,
978
+ "learning_rate": 0.00013640635596045707,
979
+ "loss": 2.6098,
980
+ "step": 7050
981
+ },
982
+ {
983
+ "epoch": 5.73,
984
+ "learning_rate": 0.00013616809031436876,
985
+ "loss": 2.5955,
986
+ "step": 7100
987
+ },
988
+ {
989
+ "epoch": 5.77,
990
+ "learning_rate": 0.0001359279665478584,
991
+ "loss": 2.5979,
992
+ "step": 7150
993
+ },
994
+ {
995
+ "epoch": 5.82,
996
+ "learning_rate": 0.0001356859919552337,
997
+ "loss": 2.6305,
998
+ "step": 7200
999
+ },
1000
+ {
1001
+ "epoch": 5.86,
1002
+ "learning_rate": 0.0001354421738870255,
1003
+ "loss": 2.6193,
1004
+ "step": 7250
1005
+ },
1006
+ {
1007
+ "epoch": 5.9,
1008
+ "learning_rate": 0.00013519651974976433,
1009
+ "loss": 2.6063,
1010
+ "step": 7300
1011
+ },
1012
+ {
1013
+ "epoch": 5.94,
1014
+ "learning_rate": 0.00013494903700575562,
1015
+ "loss": 2.6187,
1016
+ "step": 7350
1017
+ },
1018
+ {
1019
+ "epoch": 5.98,
1020
+ "learning_rate": 0.00013469973317285284,
1021
+ "loss": 2.6155,
1022
+ "step": 7400
1023
+ },
1024
+ {
1025
+ "epoch": 6.02,
1026
+ "learning_rate": 0.0001344486158242292,
1027
+ "loss": 2.5029,
1028
+ "step": 7450
1029
+ },
1030
+ {
1031
+ "epoch": 6.06,
1032
+ "learning_rate": 0.00013419569258814757,
1033
+ "loss": 2.3144,
1034
+ "step": 7500
1035
+ },
1036
+ {
1037
+ "epoch": 6.06,
1038
+ "eval_bleu": 1.4240728657820112,
1039
+ "eval_loss": 3.422335624694824,
1040
+ "eval_runtime": 1958.4348,
1041
+ "eval_samples_per_second": 4.497,
1042
+ "eval_steps_per_second": 0.281,
1043
+ "step": 7500
1044
+ },
1045
+ {
1046
+ "epoch": 6.1,
1047
+ "learning_rate": 0.00013394097114772887,
1048
+ "loss": 2.3243,
1049
+ "step": 7550
1050
+ },
1051
+ {
1052
+ "epoch": 6.14,
1053
+ "learning_rate": 0.00013368445924071844,
1054
+ "loss": 2.3288,
1055
+ "step": 7600
1056
+ },
1057
+ {
1058
+ "epoch": 6.18,
1059
+ "learning_rate": 0.00013342616465925126,
1060
+ "loss": 2.3569,
1061
+ "step": 7650
1062
+ },
1063
+ {
1064
+ "epoch": 6.22,
1065
+ "learning_rate": 0.00013316609524961502,
1066
+ "loss": 2.3593,
1067
+ "step": 7700
1068
+ },
1069
+ {
1070
+ "epoch": 6.26,
1071
+ "learning_rate": 0.00013290425891201196,
1072
+ "loss": 2.3579,
1073
+ "step": 7750
1074
+ },
1075
+ {
1076
+ "epoch": 6.3,
1077
+ "learning_rate": 0.00013264066360031872,
1078
+ "loss": 2.3623,
1079
+ "step": 7800
1080
+ },
1081
+ {
1082
+ "epoch": 6.34,
1083
+ "learning_rate": 0.0001323753173218448,
1084
+ "loss": 2.3642,
1085
+ "step": 7850
1086
+ },
1087
+ {
1088
+ "epoch": 6.38,
1089
+ "learning_rate": 0.00013210822813708936,
1090
+ "loss": 2.3494,
1091
+ "step": 7900
1092
+ },
1093
+ {
1094
+ "epoch": 6.42,
1095
+ "learning_rate": 0.0001318394041594963,
1096
+ "loss": 2.3568,
1097
+ "step": 7950
1098
+ },
1099
+ {
1100
+ "epoch": 6.46,
1101
+ "learning_rate": 0.00013156885355520778,
1102
+ "loss": 2.3675,
1103
+ "step": 8000
1104
+ },
1105
+ {
1106
+ "epoch": 6.46,
1107
+ "eval_bleu": 1.716707873647693,
1108
+ "eval_loss": 3.403074026107788,
1109
+ "eval_runtime": 1958.6783,
1110
+ "eval_samples_per_second": 4.497,
1111
+ "eval_steps_per_second": 0.281,
1112
+ "step": 8000
1113
+ },
1114
+ {
1115
+ "epoch": 6.5,
1116
+ "learning_rate": 0.0001312965845428162,
1117
+ "loss": 2.3814,
1118
+ "step": 8050
1119
+ },
1120
+ {
1121
+ "epoch": 6.54,
1122
+ "learning_rate": 0.0001310226053931146,
1123
+ "loss": 2.3737,
1124
+ "step": 8100
1125
+ },
1126
+ {
1127
+ "epoch": 6.58,
1128
+ "learning_rate": 0.00013074692442884524,
1129
+ "loss": 2.3713,
1130
+ "step": 8150
1131
+ },
1132
+ {
1133
+ "epoch": 6.62,
1134
+ "learning_rate": 0.00013046955002444697,
1135
+ "loss": 2.3989,
1136
+ "step": 8200
1137
+ },
1138
+ {
1139
+ "epoch": 6.66,
1140
+ "learning_rate": 0.00013019049060580067,
1141
+ "loss": 2.377,
1142
+ "step": 8250
1143
+ },
1144
+ {
1145
+ "epoch": 6.7,
1146
+ "learning_rate": 0.00012991538574407902,
1147
+ "loss": 2.3949,
1148
+ "step": 8300
1149
+ },
1150
+ {
1151
+ "epoch": 6.74,
1152
+ "learning_rate": 0.0001296330150553478,
1153
+ "loss": 2.3995,
1154
+ "step": 8350
1155
+ },
1156
+ {
1157
+ "epoch": 6.78,
1158
+ "learning_rate": 0.00012934898476402848,
1159
+ "loss": 2.3936,
1160
+ "step": 8400
1161
+ },
1162
+ {
1163
+ "epoch": 6.83,
1164
+ "learning_rate": 0.0001290633034981898,
1165
+ "loss": 2.3825,
1166
+ "step": 8450
1167
+ },
1168
+ {
1169
+ "epoch": 6.87,
1170
+ "learning_rate": 0.00012877597993605252,
1171
+ "loss": 2.3931,
1172
+ "step": 8500
1173
+ },
1174
+ {
1175
+ "epoch": 6.87,
1176
+ "eval_bleu": 1.5281360670298858,
1177
+ "eval_loss": 3.3454949855804443,
1178
+ "eval_runtime": 1957.7441,
1179
+ "eval_samples_per_second": 4.499,
1180
+ "eval_steps_per_second": 0.281,
1181
+ "step": 8500
1182
+ },
1183
+ {
1184
+ "epoch": 6.91,
1185
+ "learning_rate": 0.00012848702280572602,
1186
+ "loss": 2.3948,
1187
+ "step": 8550
1188
+ },
1189
+ {
1190
+ "epoch": 6.95,
1191
+ "learning_rate": 0.0001281964408849429,
1192
+ "loss": 2.4046,
1193
+ "step": 8600
1194
+ },
1195
+ {
1196
+ "epoch": 6.99,
1197
+ "learning_rate": 0.00012790424300079258,
1198
+ "loss": 2.4168,
1199
+ "step": 8650
1200
+ },
1201
+ {
1202
+ "epoch": 7.03,
1203
+ "learning_rate": 0.00012761043802945305,
1204
+ "loss": 2.238,
1205
+ "step": 8700
1206
+ },
1207
+ {
1208
+ "epoch": 7.07,
1209
+ "learning_rate": 0.00012731503489592122,
1210
+ "loss": 2.1129,
1211
+ "step": 8750
1212
+ },
1213
+ {
1214
+ "epoch": 7.11,
1215
+ "learning_rate": 0.0001270180425737418,
1216
+ "loss": 2.1072,
1217
+ "step": 8800
1218
+ },
1219
+ {
1220
+ "epoch": 7.15,
1221
+ "learning_rate": 0.00012671947008473483,
1222
+ "loss": 2.1193,
1223
+ "step": 8850
1224
+ },
1225
+ {
1226
+ "epoch": 7.19,
1227
+ "learning_rate": 0.00012641932649872138,
1228
+ "loss": 2.1408,
1229
+ "step": 8900
1230
+ },
1231
+ {
1232
+ "epoch": 7.23,
1233
+ "learning_rate": 0.00012611762093324829,
1234
+ "loss": 2.139,
1235
+ "step": 8950
1236
+ },
1237
+ {
1238
+ "epoch": 7.27,
1239
+ "learning_rate": 0.00012581436255331107,
1240
+ "loss": 2.1578,
1241
+ "step": 9000
1242
+ },
1243
+ {
1244
+ "epoch": 7.27,
1245
+ "eval_bleu": 1.6379507937528601,
1246
+ "eval_loss": 3.419518232345581,
1247
+ "eval_runtime": 1957.3561,
1248
+ "eval_samples_per_second": 4.5,
1249
+ "eval_steps_per_second": 0.282,
1250
+ "step": 9000
1251
+ },
1252
+ {
1253
+ "epoch": 7.31,
1254
+ "learning_rate": 0.00012550956057107553,
1255
+ "loss": 2.1684,
1256
+ "step": 9050
1257
+ },
1258
+ {
1259
+ "epoch": 7.35,
1260
+ "learning_rate": 0.0001252032242455979,
1261
+ "loss": 2.1481,
1262
+ "step": 9100
1263
+ },
1264
+ {
1265
+ "epoch": 7.39,
1266
+ "learning_rate": 0.00012489536288254354,
1267
+ "loss": 2.1518,
1268
+ "step": 9150
1269
+ },
1270
+ {
1271
+ "epoch": 7.43,
1272
+ "learning_rate": 0.0001245859858339044,
1273
+ "loss": 2.1639,
1274
+ "step": 9200
1275
+ },
1276
+ {
1277
+ "epoch": 7.47,
1278
+ "learning_rate": 0.00012427510249771485,
1279
+ "loss": 2.173,
1280
+ "step": 9250
1281
+ },
1282
+ {
1283
+ "epoch": 7.51,
1284
+ "learning_rate": 0.00012396272231776601,
1285
+ "loss": 2.1753,
1286
+ "step": 9300
1287
+ },
1288
+ {
1289
+ "epoch": 7.55,
1290
+ "learning_rate": 0.00012364885478331931,
1291
+ "loss": 2.1735,
1292
+ "step": 9350
1293
+ },
1294
+ {
1295
+ "epoch": 7.59,
1296
+ "learning_rate": 0.0001233335094288177,
1297
+ "loss": 2.1847,
1298
+ "step": 9400
1299
+ },
1300
+ {
1301
+ "epoch": 7.63,
1302
+ "learning_rate": 0.00012301669583359657,
1303
+ "loss": 2.1763,
1304
+ "step": 9450
1305
+ },
1306
+ {
1307
+ "epoch": 7.67,
1308
+ "learning_rate": 0.0001226984236215922,
1309
+ "loss": 2.1872,
1310
+ "step": 9500
1311
+ },
1312
+ {
1313
+ "epoch": 7.67,
1314
+ "eval_bleu": 1.6695642693305304,
1315
+ "eval_loss": 3.401309013366699,
1316
+ "eval_runtime": 1957.1624,
1317
+ "eval_samples_per_second": 4.5,
1318
+ "eval_steps_per_second": 0.282,
1319
+ "step": 9500
1320
+ },
1321
+ {
1322
+ "epoch": 7.71,
1323
+ "learning_rate": 0.00012237870246104989,
1324
+ "loss": 2.1915,
1325
+ "step": 9550
1326
+ },
1327
+ {
1328
+ "epoch": 7.75,
1329
+ "learning_rate": 0.00012205754206422992,
1330
+ "loss": 2.1994,
1331
+ "step": 9600
1332
+ },
1333
+ {
1334
+ "epoch": 7.79,
1335
+ "learning_rate": 0.00012173495218711278,
1336
+ "loss": 2.1896,
1337
+ "step": 9650
1338
+ },
1339
+ {
1340
+ "epoch": 7.83,
1341
+ "learning_rate": 0.00012141094262910259,
1342
+ "loss": 2.1748,
1343
+ "step": 9700
1344
+ },
1345
+ {
1346
+ "epoch": 7.88,
1347
+ "learning_rate": 0.00012108552323272957,
1348
+ "loss": 2.1717,
1349
+ "step": 9750
1350
+ },
1351
+ {
1352
+ "epoch": 7.92,
1353
+ "learning_rate": 0.00012075870388335098,
1354
+ "loss": 2.219,
1355
+ "step": 9800
1356
+ },
1357
+ {
1358
+ "epoch": 7.96,
1359
+ "learning_rate": 0.00012043049450885088,
1360
+ "loss": 2.1897,
1361
+ "step": 9850
1362
+ },
1363
+ {
1364
+ "epoch": 8.0,
1365
+ "learning_rate": 0.00012010090507933847,
1366
+ "loss": 2.2052,
1367
+ "step": 9900
1368
+ },
1369
+ {
1370
+ "epoch": 8.04,
1371
+ "learning_rate": 0.00011976994560684529,
1372
+ "loss": 1.9818,
1373
+ "step": 9950
1374
+ },
1375
+ {
1376
+ "epoch": 8.08,
1377
+ "learning_rate": 0.0001194376261450211,
1378
+ "loss": 1.9173,
1379
+ "step": 10000
1380
+ },
1381
+ {
1382
+ "epoch": 8.08,
1383
+ "eval_bleu": 1.65311690438908,
1384
+ "eval_loss": 3.501976728439331,
1385
+ "eval_runtime": 1960.8318,
1386
+ "eval_samples_per_second": 4.492,
1387
+ "eval_steps_per_second": 0.281,
1388
+ "step": 10000
1389
+ },
1390
+ {
1391
+ "epoch": 8.12,
1392
+ "learning_rate": 0.00011910395678882841,
1393
+ "loss": 1.9281,
1394
+ "step": 10050
1395
+ },
1396
+ {
1397
+ "epoch": 8.16,
1398
+ "learning_rate": 0.00011876894767423582,
1399
+ "loss": 1.946,
1400
+ "step": 10100
1401
+ },
1402
+ {
1403
+ "epoch": 8.2,
1404
+ "learning_rate": 0.00011843260897791022,
1405
+ "loss": 1.9537,
1406
+ "step": 10150
1407
+ },
1408
+ {
1409
+ "epoch": 8.24,
1410
+ "learning_rate": 0.00011809495091690755,
1411
+ "loss": 1.965,
1412
+ "step": 10200
1413
+ },
1414
+ {
1415
+ "epoch": 8.28,
1416
+ "learning_rate": 0.00011775598374836245,
1417
+ "loss": 1.963,
1418
+ "step": 10250
1419
+ },
1420
+ {
1421
+ "epoch": 8.32,
1422
+ "learning_rate": 0.00011741571776917671,
1423
+ "loss": 1.9654,
1424
+ "step": 10300
1425
+ },
1426
+ {
1427
+ "epoch": 8.36,
1428
+ "learning_rate": 0.00011707416331570644,
1429
+ "loss": 1.9705,
1430
+ "step": 10350
1431
+ },
1432
+ {
1433
+ "epoch": 8.4,
1434
+ "learning_rate": 0.00011673133076344812,
1435
+ "loss": 1.9857,
1436
+ "step": 10400
1437
+ },
1438
+ {
1439
+ "epoch": 8.44,
1440
+ "learning_rate": 0.00011638723052672338,
1441
+ "loss": 1.9584,
1442
+ "step": 10450
1443
+ },
1444
+ {
1445
+ "epoch": 8.48,
1446
+ "learning_rate": 0.00011604187305836272,
1447
+ "loss": 1.9589,
1448
+ "step": 10500
1449
+ },
1450
+ {
1451
+ "epoch": 8.48,
1452
+ "eval_bleu": 1.7329668869688968,
1453
+ "eval_loss": 3.4673211574554443,
1454
+ "eval_runtime": 1960.7298,
1455
+ "eval_samples_per_second": 4.492,
1456
+ "eval_steps_per_second": 0.281,
1457
+ "step": 10500
1458
+ },
1459
+ {
1460
+ "epoch": 8.52,
1461
+ "learning_rate": 0.00011569526884938779,
1462
+ "loss": 1.9859,
1463
+ "step": 10550
1464
+ },
1465
+ {
1466
+ "epoch": 8.56,
1467
+ "learning_rate": 0.000115347428428693,
1468
+ "loss": 1.9883,
1469
+ "step": 10600
1470
+ },
1471
+ {
1472
+ "epoch": 8.6,
1473
+ "learning_rate": 0.00011499836236272536,
1474
+ "loss": 2.0001,
1475
+ "step": 10650
1476
+ },
1477
+ {
1478
+ "epoch": 8.64,
1479
+ "learning_rate": 0.00011464808125516378,
1480
+ "loss": 2.0007,
1481
+ "step": 10700
1482
+ },
1483
+ {
1484
+ "epoch": 8.68,
1485
+ "learning_rate": 0.00011429659574659675,
1486
+ "loss": 2.006,
1487
+ "step": 10750
1488
+ },
1489
+ {
1490
+ "epoch": 8.72,
1491
+ "learning_rate": 0.00011394391651419925,
1492
+ "loss": 2.0108,
1493
+ "step": 10800
1494
+ },
1495
+ {
1496
+ "epoch": 8.76,
1497
+ "learning_rate": 0.00011359005427140825,
1498
+ "loss": 2.0178,
1499
+ "step": 10850
1500
+ },
1501
+ {
1502
+ "epoch": 8.8,
1503
+ "learning_rate": 0.00011323501976759747,
1504
+ "loss": 2.0067,
1505
+ "step": 10900
1506
+ },
1507
+ {
1508
+ "epoch": 8.84,
1509
+ "learning_rate": 0.00011287882378775071,
1510
+ "loss": 2.0209,
1511
+ "step": 10950
1512
+ },
1513
+ {
1514
+ "epoch": 8.88,
1515
+ "learning_rate": 0.0001125214771521342,
1516
+ "loss": 2.0189,
1517
+ "step": 11000
1518
+ },
1519
+ {
1520
+ "epoch": 8.88,
1521
+ "eval_bleu": 1.79820635406132,
1522
+ "eval_loss": 3.4235384464263916,
1523
+ "eval_runtime": 1972.6388,
1524
+ "eval_samples_per_second": 4.465,
1525
+ "eval_steps_per_second": 0.279,
1526
+ "step": 11000
1527
+ },
1528
+ {
1529
+ "epoch": 8.93,
1530
+ "learning_rate": 0.00011216299071596803,
1531
+ "loss": 2.032,
1532
+ "step": 11050
1533
+ },
1534
+ {
1535
+ "epoch": 8.97,
1536
+ "learning_rate": 0.0001118033753690963,
1537
+ "loss": 2.0373,
1538
+ "step": 11100
1539
+ },
1540
+ {
1541
+ "epoch": 9.01,
1542
+ "learning_rate": 0.00011144264203565639,
1543
+ "loss": 1.9951,
1544
+ "step": 11150
1545
+ },
1546
+ {
1547
+ "epoch": 9.05,
1548
+ "learning_rate": 0.000111080801673747,
1549
+ "loss": 1.7539,
1550
+ "step": 11200
1551
+ },
1552
+ {
1553
+ "epoch": 9.09,
1554
+ "learning_rate": 0.00011071786527509542,
1555
+ "loss": 1.7594,
1556
+ "step": 11250
1557
+ },
1558
+ {
1559
+ "epoch": 9.13,
1560
+ "learning_rate": 0.00011035384386472355,
1561
+ "loss": 1.7635,
1562
+ "step": 11300
1563
+ },
1564
+ {
1565
+ "epoch": 9.17,
1566
+ "learning_rate": 0.00010998874850061295,
1567
+ "loss": 1.7876,
1568
+ "step": 11350
1569
+ },
1570
+ {
1571
+ "epoch": 9.21,
1572
+ "learning_rate": 0.00010962259027336899,
1573
+ "loss": 1.7757,
1574
+ "step": 11400
1575
+ },
1576
+ {
1577
+ "epoch": 9.25,
1578
+ "learning_rate": 0.00010925538030588402,
1579
+ "loss": 1.7824,
1580
+ "step": 11450
1581
+ },
1582
+ {
1583
+ "epoch": 9.29,
1584
+ "learning_rate": 0.00010888712975299928,
1585
+ "loss": 1.7986,
1586
+ "step": 11500
1587
+ },
1588
+ {
1589
+ "epoch": 9.29,
1590
+ "eval_bleu": 1.8285756428941922,
1591
+ "eval_loss": 3.521059989929199,
1592
+ "eval_runtime": 1961.2988,
1593
+ "eval_samples_per_second": 4.491,
1594
+ "eval_steps_per_second": 0.281,
1595
+ "step": 11500
1596
+ },
1597
+ {
1598
+ "epoch": 9.33,
1599
+ "learning_rate": 0.00010851784980116624,
1600
+ "loss": 1.817,
1601
+ "step": 11550
1602
+ },
1603
+ {
1604
+ "epoch": 9.37,
1605
+ "learning_rate": 0.00010814755166810671,
1606
+ "loss": 1.82,
1607
+ "step": 11600
1608
+ },
1609
+ {
1610
+ "epoch": 9.41,
1611
+ "learning_rate": 0.00010777624660247202,
1612
+ "loss": 1.808,
1613
+ "step": 11650
1614
+ },
1615
+ {
1616
+ "epoch": 9.45,
1617
+ "learning_rate": 0.00010740394588350149,
1618
+ "loss": 1.8324,
1619
+ "step": 11700
1620
+ },
1621
+ {
1622
+ "epoch": 9.49,
1623
+ "learning_rate": 0.00010703066082067957,
1624
+ "loss": 1.8235,
1625
+ "step": 11750
1626
+ },
1627
+ {
1628
+ "epoch": 9.53,
1629
+ "learning_rate": 0.00010665640275339248,
1630
+ "loss": 1.8376,
1631
+ "step": 11800
1632
+ },
1633
+ {
1634
+ "epoch": 9.57,
1635
+ "learning_rate": 0.0001062811830505836,
1636
+ "loss": 1.8291,
1637
+ "step": 11850
1638
+ },
1639
+ {
1640
+ "epoch": 9.61,
1641
+ "learning_rate": 0.00010590501311040825,
1642
+ "loss": 1.8362,
1643
+ "step": 11900
1644
+ },
1645
+ {
1646
+ "epoch": 9.65,
1647
+ "learning_rate": 0.00010552790435988735,
1648
+ "loss": 1.8365,
1649
+ "step": 11950
1650
+ },
1651
+ {
1652
+ "epoch": 9.69,
1653
+ "learning_rate": 0.00010514986825456032,
1654
+ "loss": 1.8412,
1655
+ "step": 12000
1656
+ },
1657
+ {
1658
+ "epoch": 9.69,
1659
+ "eval_bleu": 1.8625396953625901,
1660
+ "eval_loss": 3.4715662002563477,
1661
+ "eval_runtime": 1961.6723,
1662
+ "eval_samples_per_second": 4.49,
1663
+ "eval_steps_per_second": 0.281,
1664
+ "step": 12000
1665
+ }
1666
+ ],
1667
+ "max_steps": 30000,
1668
+ "num_train_epochs": 25,
1669
+ "total_flos": 7.47246491258865e+20,
1670
+ "trial_name": null,
1671
+ "trial_params": null
1672
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5140a96fc22eda926b6bd84c7080d17a2f3291ee88007f8f6ae69d806afa26ba
3
+ size 3896
vocab.txt ADDED
The diff for this file is too large to render. See raw diff