samahadhoud commited on
Commit
cf84c4a
·
verified ·
1 Parent(s): 95b90c3

Add step-10000 checkpoint

Browse files
config.json ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "PIXELSumModel"
4
+ ],
5
+ "decoder": {
6
+ "_name_or_path": "gpt2",
7
+ "activation_function": "gelu_new",
8
+ "add_cross_attention": true,
9
+ "architectures": [
10
+ "GPT2LMHeadModel"
11
+ ],
12
+ "attn_pdrop": 0.1,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": 50256,
15
+ "chunk_size_feed_forward": 0,
16
+ "cross_attention_hidden_size": null,
17
+ "cross_attention_reduce_factor": 1,
18
+ "decoder_start_token_id": null,
19
+ "diversity_penalty": 0.0,
20
+ "do_sample": false,
21
+ "early_stopping": false,
22
+ "embd_pdrop": 0.1,
23
+ "encoder_hidden_size": 768,
24
+ "encoder_no_repeat_ngram_size": 0,
25
+ "eos_token_id": 50256,
26
+ "exponential_decay_length_penalty": null,
27
+ "finetuning_task": null,
28
+ "forced_bos_token_id": null,
29
+ "forced_eos_token_id": null,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "is_decoder": true,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "layer_norm_epsilon": 1e-05,
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "min_length": 0,
45
+ "model_type": "gpt2",
46
+ "n_ctx": 1024,
47
+ "n_embd": 768,
48
+ "n_head": 12,
49
+ "n_inner": null,
50
+ "n_layer": 12,
51
+ "n_positions": 1024,
52
+ "no_repeat_ngram_size": 0,
53
+ "num_beam_groups": 1,
54
+ "num_beams": 1,
55
+ "num_return_sequences": 1,
56
+ "output_attentions": false,
57
+ "output_hidden_states": false,
58
+ "output_scores": false,
59
+ "pad_token_id": null,
60
+ "prefix": null,
61
+ "problem_type": null,
62
+ "pruned_heads": {},
63
+ "remove_invalid_values": false,
64
+ "reorder_and_upcast_attn": false,
65
+ "repetition_penalty": 1.0,
66
+ "resid_pdrop": 0.1,
67
+ "return_dict": true,
68
+ "return_dict_in_generate": false,
69
+ "scale_attn_by_inverse_layer_idx": false,
70
+ "scale_attn_weights": true,
71
+ "sep_token_id": null,
72
+ "summary_activation": null,
73
+ "summary_first_dropout": 0.1,
74
+ "summary_proj_to_labels": true,
75
+ "summary_type": "cls_index",
76
+ "summary_use_proj": true,
77
+ "task_specific_params": {
78
+ "text-generation": {
79
+ "do_sample": true,
80
+ "max_length": 64
81
+ }
82
+ },
83
+ "temperature": 1.0,
84
+ "tie_encoder_decoder": false,
85
+ "tie_word_embeddings": true,
86
+ "tokenizer_class": null,
87
+ "top_k": 50,
88
+ "top_p": 1.0,
89
+ "torch_dtype": null,
90
+ "torchscript": false,
91
+ "transformers_version": "4.19.0",
92
+ "typical_p": 1.0,
93
+ "use_bfloat16": false,
94
+ "use_cache": true,
95
+ "vocab_size": 50257
96
+ },
97
+ "decoder_start_token_id": 50256,
98
+ "encoder": {
99
+ "_name_or_path": "Team-PIXEL/pixel-base",
100
+ "add_cross_attention": false,
101
+ "architectures": [
102
+ "PIXELForPreTraining"
103
+ ],
104
+ "attention_probs_dropout_prob": 0.1,
105
+ "bad_words_ids": null,
106
+ "bos_token_id": null,
107
+ "chunk_size_feed_forward": 0,
108
+ "cross_attention_hidden_size": null,
109
+ "decoder_hidden_size": 512,
110
+ "decoder_intermediate_size": 2048,
111
+ "decoder_num_attention_heads": 16,
112
+ "decoder_num_hidden_layers": 8,
113
+ "decoder_start_token_id": null,
114
+ "diversity_penalty": 0.0,
115
+ "do_eval": true,
116
+ "do_sample": false,
117
+ "early_stopping": false,
118
+ "encoder_no_repeat_ngram_size": 0,
119
+ "eos_token_id": null,
120
+ "exponential_decay_length_penalty": null,
121
+ "finetuning_task": null,
122
+ "forced_bos_token_id": null,
123
+ "forced_eos_token_id": null,
124
+ "hidden_act": "gelu",
125
+ "hidden_dropout_prob": 0.1,
126
+ "hidden_size": 768,
127
+ "id2label": {
128
+ "0": "LABEL_0",
129
+ "1": "LABEL_1"
130
+ },
131
+ "image_size": [
132
+ 16,
133
+ 8464
134
+ ],
135
+ "initializer_range": 0.02,
136
+ "intermediate_size": 3072,
137
+ "is_decoder": false,
138
+ "is_encoder_decoder": false,
139
+ "label2id": {
140
+ "LABEL_0": 0,
141
+ "LABEL_1": 1
142
+ },
143
+ "layer_norm_eps": 1e-12,
144
+ "length_penalty": 1.0,
145
+ "mask_ratio": 0.25,
146
+ "max_length": 20,
147
+ "min_length": 0,
148
+ "model_type": "pixel",
149
+ "no_repeat_ngram_size": 0,
150
+ "norm_pix_loss": true,
151
+ "num_attention_heads": 12,
152
+ "num_beam_groups": 1,
153
+ "num_beams": 1,
154
+ "num_channels": 3,
155
+ "num_hidden_layers": 12,
156
+ "num_return_sequences": 1,
157
+ "output_attentions": false,
158
+ "output_hidden_states": false,
159
+ "output_scores": false,
160
+ "pad_token_id": null,
161
+ "patch_size": 16,
162
+ "prefix": null,
163
+ "problem_type": null,
164
+ "pruned_heads": {},
165
+ "qkv_bias": true,
166
+ "remove_invalid_values": false,
167
+ "repetition_penalty": 1.0,
168
+ "return_dict": true,
169
+ "return_dict_in_generate": false,
170
+ "sep_token_id": null,
171
+ "task_specific_params": null,
172
+ "temperature": 1.0,
173
+ "tie_encoder_decoder": false,
174
+ "tie_word_embeddings": true,
175
+ "tokenizer_class": null,
176
+ "top_k": 50,
177
+ "top_p": 1.0,
178
+ "torch_dtype": "float32",
179
+ "torchscript": false,
180
+ "transformers_version": "4.19.0",
181
+ "typical_p": 1.0,
182
+ "use_bfloat16": false
183
+ },
184
+ "eos_token_id": 50256,
185
+ "is_encoder_decoder": true,
186
+ "model_type": "pixelsum",
187
+ "pad_token_id": 50256,
188
+ "tie_word_embeddings": false,
189
+ "torch_dtype": "float32",
190
+ "transformers_version": null,
191
+ "vocab_size": 50257
192
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c0c2461a01d8acf726a91c0121ed4716c53e02dec1dff23ef41f70d85d8005b
3
+ size 980777906
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
text_renderer_config.json ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "background_color": "white",
3
+ "dpi": 120,
4
+ "font_color": "black",
5
+ "font_file": "49e6dc219d1a1a1c9236acaf05a48b542002016a6dc877ee72baab085a84257b.3f28e7f4b38e1efe1b6da4a3732404c19d4c6a614ff32dce90a251e293d4ce58",
6
+ "font_size": 8,
7
+ "fonts_list": [
8
+ "Apple Color Emoji",
9
+ "DejaVu Sans",
10
+ "DejaVu Sans Mono",
11
+ "DejaVu Serif",
12
+ "Go Noto Current",
13
+ "Inconsolata",
14
+ "Monospace",
15
+ "Noto Fangsong KSS Rotated",
16
+ "Noto Fangsong KSS Vertical",
17
+ "Noto Kufi Arabic",
18
+ "Noto Music",
19
+ "Noto Naskh Arabic",
20
+ "Noto Naskh Arabic UI",
21
+ "Noto Nastaliq Urdu",
22
+ "Noto Rashi Hebrew",
23
+ "Noto Sans",
24
+ "Noto Sans Adlam",
25
+ "Noto Sans Adlam Unjoined",
26
+ "Noto Sans Anatolian Hieroglyphs",
27
+ "Noto Sans Arabic",
28
+ "Noto Sans Arabic UI",
29
+ "Noto Sans Armenian",
30
+ "Noto Sans Avestan",
31
+ "Noto Sans Balinese",
32
+ "Noto Sans Bamum",
33
+ "Noto Sans Bassa Vah",
34
+ "Noto Sans Batak",
35
+ "Noto Sans Bhaiksuki",
36
+ "Noto Sans Brahmi",
37
+ "Noto Sans Buginese",
38
+ "Noto Sans Buhid",
39
+ "Noto Sans CJK HK",
40
+ "Noto Sans CJK JP",
41
+ "Noto Sans CJK KR",
42
+ "Noto Sans CJK SC",
43
+ "Noto Sans CJK TC",
44
+ "Noto Sans Canadian Aboriginal",
45
+ "Noto Sans Carian",
46
+ "Noto Sans Caucasian Albanian",
47
+ "Noto Sans Chakma",
48
+ "Noto Sans Cham",
49
+ "Noto Sans Cherokee",
50
+ "Noto Sans Chorasmian",
51
+ "Noto Sans Coptic",
52
+ "Noto Sans Cuneiform",
53
+ "Noto Sans Cypriot",
54
+ "Noto Sans Cypro Minoan",
55
+ "Noto Sans Deseret",
56
+ "Noto Sans Devanagari",
57
+ "Noto Sans Devanagari UI",
58
+ "Noto Sans Duployan",
59
+ "Noto Sans Egyptian Hieroglyphs",
60
+ "Noto Sans Elbasan",
61
+ "Noto Sans Elymaic",
62
+ "Noto Sans Ethiopic",
63
+ "Noto Sans Georgian",
64
+ "Noto Sans Glagolitic",
65
+ "Noto Sans Gothic",
66
+ "Noto Sans Grantha",
67
+ "Noto Sans Gujarati",
68
+ "Noto Sans Gujarati UI",
69
+ "Noto Sans Gunjala Gondi",
70
+ "Noto Sans Gurmukhi",
71
+ "Noto Sans Gurmukhi UI",
72
+ "Noto Sans Hanifi Rohingya",
73
+ "Noto Sans Hanunoo",
74
+ "Noto Sans Hatran",
75
+ "Noto Sans Hebrew",
76
+ "Noto Sans Hebrew Droid",
77
+ "Noto Sans Imperial Aramaic",
78
+ "Noto Sans Indic Siyaq Numbers",
79
+ "Noto Sans Inscriptional Pahlavi",
80
+ "Noto Sans Inscriptional Parthian",
81
+ "Noto Sans Javanese",
82
+ "Noto Sans Kaithi",
83
+ "Noto Sans Kannada",
84
+ "Noto Sans Kannada UI",
85
+ "Noto Sans Kawi",
86
+ "Noto Sans Kayah Li",
87
+ "Noto Sans Kharoshthi",
88
+ "Noto Sans Khmer",
89
+ "Noto Sans Khmer UI",
90
+ "Noto Sans Khudawadi",
91
+ "Noto Sans Lao",
92
+ "Noto Sans Lao Looped",
93
+ "Noto Sans Lao Looped UI",
94
+ "Noto Sans Lao UI",
95
+ "Noto Sans Lepcha",
96
+ "Noto Sans Limbu",
97
+ "Noto Sans Linear A",
98
+ "Noto Sans Linear B",
99
+ "Noto Sans Lisu",
100
+ "Noto Sans Lycian",
101
+ "Noto Sans Lydian",
102
+ "Noto Sans Mahajani",
103
+ "Noto Sans Malayalam",
104
+ "Noto Sans Malayalam UI",
105
+ "Noto Sans Mandaic",
106
+ "Noto Sans Manichaean",
107
+ "Noto Sans Marchen",
108
+ "Noto Sans Masaram Gondi",
109
+ "Noto Sans Math",
110
+ "Noto Sans Mayan Numerals",
111
+ "Noto Sans Medefaidrin",
112
+ "Noto Sans Meetei Mayek",
113
+ "Noto Sans Mende Kikakui",
114
+ "Noto Sans Meroitic",
115
+ "Noto Sans Miao",
116
+ "Noto Sans Modi",
117
+ "Noto Sans Mongolian",
118
+ "Noto Sans Mono",
119
+ "Noto Sans Mro",
120
+ "Noto Sans Multani",
121
+ "Noto Sans Myanmar",
122
+ "Noto Sans Myanmar UI",
123
+ "Noto Sans NKo",
124
+ "Noto Sans NKo Unjoined",
125
+ "Noto Sans Nabataean",
126
+ "Noto Sans Nag Mundari",
127
+ "Noto Sans Nandinagari",
128
+ "Noto Sans New Tai Lue",
129
+ "Noto Sans Newa",
130
+ "Noto Sans Nushu",
131
+ "Noto Sans Ogham",
132
+ "Noto Sans Ol Chiki",
133
+ "Noto Sans Old Hungarian",
134
+ "Noto Sans Old Italic",
135
+ "Noto Sans Old North Arabian",
136
+ "Noto Sans Old Permic",
137
+ "Noto Sans Old Persian",
138
+ "Noto Sans Old Sogdian",
139
+ "Noto Sans Old South Arabian",
140
+ "Noto Sans Old Turkic",
141
+ "Noto Sans Oriya",
142
+ "Noto Sans Osage",
143
+ "Noto Sans Osmanya",
144
+ "Noto Sans Pahawh Hmong",
145
+ "Noto Sans Palmyrene",
146
+ "Noto Sans Pau Cin Hau",
147
+ "Noto Sans PhagsPa",
148
+ "Noto Sans Phoenician",
149
+ "Noto Sans Psalter Pahlavi",
150
+ "Noto Sans Rejang",
151
+ "Noto Sans Runic",
152
+ "Noto Sans Samaritan",
153
+ "Noto Sans Saurashtra",
154
+ "Noto Sans Sharada",
155
+ "Noto Sans Shavian",
156
+ "Noto Sans Siddham",
157
+ "Noto Sans SignWriting",
158
+ "Noto Sans Sinhala",
159
+ "Noto Sans Sinhala UI",
160
+ "Noto Sans Sogdian",
161
+ "Noto Sans Sora Sompeng",
162
+ "Noto Sans Soyombo",
163
+ "Noto Sans Sundanese",
164
+ "Noto Sans Sunuwar",
165
+ "Noto Sans Syloti Nagri",
166
+ "Noto Sans Symbols",
167
+ "Noto Sans Symbols 2",
168
+ "Noto Sans Syriac",
169
+ "Noto Sans Syriac Eastern",
170
+ "Noto Sans Syriac Western",
171
+ "Noto Sans Tagalog",
172
+ "Noto Sans Tagbanwa",
173
+ "Noto Sans Tai Le",
174
+ "Noto Sans Tai Tham",
175
+ "Noto Sans Tai Viet",
176
+ "Noto Sans Takri",
177
+ "Noto Sans Tamil",
178
+ "Noto Sans Tamil Supplement",
179
+ "Noto Sans Tamil UI",
180
+ "Noto Sans Tangsa",
181
+ "Noto Sans Telugu",
182
+ "Noto Sans Telugu UI",
183
+ "Noto Sans Test",
184
+ "Noto Sans Thaana",
185
+ "Noto Sans Thai",
186
+ "Noto Sans Thai Looped",
187
+ "Noto Sans Thai Looped UI",
188
+ "Noto Sans Thai UI",
189
+ "Noto Sans Tifinagh",
190
+ "Noto Sans Tifinagh APT",
191
+ "Noto Sans Tifinagh Adrar",
192
+ "Noto Sans Tifinagh Agraw Imazighen",
193
+ "Noto Sans Tifinagh Ahaggar",
194
+ "Noto Sans Tifinagh Air",
195
+ "Noto Sans Tifinagh Azawagh",
196
+ "Noto Sans Tifinagh Ghat",
197
+ "Noto Sans Tifinagh Hawad",
198
+ "Noto Sans Tifinagh Rhissa Ixa",
199
+ "Noto Sans Tifinagh SIL",
200
+ "Noto Sans Tifinagh Tawellemmet",
201
+ "Noto Sans Tirhuta",
202
+ "Noto Sans Ugaritic",
203
+ "Noto Sans Vai",
204
+ "Noto Sans Vithkuqi",
205
+ "Noto Sans Wancho",
206
+ "Noto Sans Warang Citi",
207
+ "Noto Sans Yi",
208
+ "Noto Sans Zanabazar Square",
209
+ "Noto Traditional Nushu",
210
+ "Noto Znamenny Musical Notation",
211
+ "NotoSansOldHungarianUI",
212
+ "Sans",
213
+ "Serif",
214
+ "Source Code Pro",
215
+ "System-ui",
216
+ "Ubuntu",
217
+ "Ubuntu Condensed",
218
+ "Ubuntu Mono"
219
+ ],
220
+ "max_seq_length": 529,
221
+ "pad_size": 3,
222
+ "pixels_per_patch": 16,
223
+ "rgb": false,
224
+ "text_renderer_type": "PangoCairoTextRenderer"
225
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
trainer_state.json ADDED
@@ -0,0 +1,1120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.98361849433367,
3
+ "best_model_checkpoint": "experiments/translation/gpt2/2025-06-24_10-08-42/checkpoint-8000",
4
+ "epoch": 6.4617558022199795,
5
+ "global_step": 8000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.04,
12
+ "learning_rate": 4.8e-06,
13
+ "loss": 5.4804,
14
+ "step": 50
15
+ },
16
+ {
17
+ "epoch": 0.08,
18
+ "learning_rate": 9.799999999999998e-06,
19
+ "loss": 4.9145,
20
+ "step": 100
21
+ },
22
+ {
23
+ "epoch": 0.12,
24
+ "learning_rate": 1.4799999999999999e-05,
25
+ "loss": 4.7992,
26
+ "step": 150
27
+ },
28
+ {
29
+ "epoch": 0.16,
30
+ "learning_rate": 1.98e-05,
31
+ "loss": 4.7128,
32
+ "step": 200
33
+ },
34
+ {
35
+ "epoch": 0.2,
36
+ "learning_rate": 2.4799999999999996e-05,
37
+ "loss": 4.6691,
38
+ "step": 250
39
+ },
40
+ {
41
+ "epoch": 0.24,
42
+ "learning_rate": 2.9799999999999996e-05,
43
+ "loss": 4.6397,
44
+ "step": 300
45
+ },
46
+ {
47
+ "epoch": 0.28,
48
+ "learning_rate": 3.48e-05,
49
+ "loss": 4.591,
50
+ "step": 350
51
+ },
52
+ {
53
+ "epoch": 0.32,
54
+ "learning_rate": 3.979999999999999e-05,
55
+ "loss": 4.5646,
56
+ "step": 400
57
+ },
58
+ {
59
+ "epoch": 0.36,
60
+ "learning_rate": 4.48e-05,
61
+ "loss": 4.5257,
62
+ "step": 450
63
+ },
64
+ {
65
+ "epoch": 0.4,
66
+ "learning_rate": 4.98e-05,
67
+ "loss": 4.4901,
68
+ "step": 500
69
+ },
70
+ {
71
+ "epoch": 0.4,
72
+ "eval_bleu": 0.11633427622704212,
73
+ "eval_loss": 4.419718265533447,
74
+ "eval_runtime": 1914.9377,
75
+ "eval_samples_per_second": 4.6,
76
+ "eval_steps_per_second": 0.144,
77
+ "step": 500
78
+ },
79
+ {
80
+ "epoch": 0.44,
81
+ "learning_rate": 5.48e-05,
82
+ "loss": 4.5062,
83
+ "step": 550
84
+ },
85
+ {
86
+ "epoch": 0.48,
87
+ "learning_rate": 5.98e-05,
88
+ "loss": 4.4699,
89
+ "step": 600
90
+ },
91
+ {
92
+ "epoch": 0.52,
93
+ "learning_rate": 6.479999999999999e-05,
94
+ "loss": 4.4781,
95
+ "step": 650
96
+ },
97
+ {
98
+ "epoch": 0.57,
99
+ "learning_rate": 6.979999999999999e-05,
100
+ "loss": 4.433,
101
+ "step": 700
102
+ },
103
+ {
104
+ "epoch": 0.61,
105
+ "learning_rate": 7.479999999999999e-05,
106
+ "loss": 4.4437,
107
+ "step": 750
108
+ },
109
+ {
110
+ "epoch": 0.65,
111
+ "learning_rate": 7.98e-05,
112
+ "loss": 4.4278,
113
+ "step": 800
114
+ },
115
+ {
116
+ "epoch": 0.69,
117
+ "learning_rate": 8.48e-05,
118
+ "loss": 4.4062,
119
+ "step": 850
120
+ },
121
+ {
122
+ "epoch": 0.73,
123
+ "learning_rate": 8.98e-05,
124
+ "loss": 4.3717,
125
+ "step": 900
126
+ },
127
+ {
128
+ "epoch": 0.77,
129
+ "learning_rate": 9.479999999999999e-05,
130
+ "loss": 4.3586,
131
+ "step": 950
132
+ },
133
+ {
134
+ "epoch": 0.81,
135
+ "learning_rate": 9.979999999999999e-05,
136
+ "loss": 4.3929,
137
+ "step": 1000
138
+ },
139
+ {
140
+ "epoch": 0.81,
141
+ "eval_bleu": 0.15546068077729897,
142
+ "eval_loss": 4.284366607666016,
143
+ "eval_runtime": 1911.4105,
144
+ "eval_samples_per_second": 4.608,
145
+ "eval_steps_per_second": 0.144,
146
+ "step": 1000
147
+ },
148
+ {
149
+ "epoch": 0.85,
150
+ "learning_rate": 0.00010479999999999999,
151
+ "loss": 4.3491,
152
+ "step": 1050
153
+ },
154
+ {
155
+ "epoch": 0.89,
156
+ "learning_rate": 0.00010979999999999999,
157
+ "loss": 4.3284,
158
+ "step": 1100
159
+ },
160
+ {
161
+ "epoch": 0.93,
162
+ "learning_rate": 0.00011479999999999999,
163
+ "loss": 4.3209,
164
+ "step": 1150
165
+ },
166
+ {
167
+ "epoch": 0.97,
168
+ "learning_rate": 0.00011979999999999998,
169
+ "loss": 4.3172,
170
+ "step": 1200
171
+ },
172
+ {
173
+ "epoch": 1.01,
174
+ "learning_rate": 0.00012479999999999997,
175
+ "loss": 4.3263,
176
+ "step": 1250
177
+ },
178
+ {
179
+ "epoch": 1.05,
180
+ "learning_rate": 0.00012979999999999998,
181
+ "loss": 4.1922,
182
+ "step": 1300
183
+ },
184
+ {
185
+ "epoch": 1.09,
186
+ "learning_rate": 0.00013479999999999997,
187
+ "loss": 4.1936,
188
+ "step": 1350
189
+ },
190
+ {
191
+ "epoch": 1.13,
192
+ "learning_rate": 0.00013979999999999998,
193
+ "loss": 4.1718,
194
+ "step": 1400
195
+ },
196
+ {
197
+ "epoch": 1.17,
198
+ "learning_rate": 0.0001448,
199
+ "loss": 4.1767,
200
+ "step": 1450
201
+ },
202
+ {
203
+ "epoch": 1.21,
204
+ "learning_rate": 0.00014979999999999998,
205
+ "loss": 4.1589,
206
+ "step": 1500
207
+ },
208
+ {
209
+ "epoch": 1.21,
210
+ "eval_bleu": 0.28584924145284585,
211
+ "eval_loss": 4.1763997077941895,
212
+ "eval_runtime": 1912.3927,
213
+ "eval_samples_per_second": 4.606,
214
+ "eval_steps_per_second": 0.144,
215
+ "step": 1500
216
+ },
217
+ {
218
+ "epoch": 1.25,
219
+ "learning_rate": 0.00014999895016087306,
220
+ "loss": 4.1576,
221
+ "step": 1550
222
+ },
223
+ {
224
+ "epoch": 1.29,
225
+ "learning_rate": 0.00014999562388001478,
226
+ "loss": 4.1006,
227
+ "step": 1600
228
+ },
229
+ {
230
+ "epoch": 1.33,
231
+ "learning_rate": 0.00014999001943582685,
232
+ "loss": 4.1227,
233
+ "step": 1650
234
+ },
235
+ {
236
+ "epoch": 1.37,
237
+ "learning_rate": 0.0001499821369985571,
238
+ "loss": 4.1087,
239
+ "step": 1700
240
+ },
241
+ {
242
+ "epoch": 1.41,
243
+ "learning_rate": 0.0001499719768076526,
244
+ "loss": 4.0728,
245
+ "step": 1750
246
+ },
247
+ {
248
+ "epoch": 1.45,
249
+ "learning_rate": 0.00014995953917175227,
250
+ "loss": 4.0705,
251
+ "step": 1800
252
+ },
253
+ {
254
+ "epoch": 1.49,
255
+ "learning_rate": 0.00014994482446867774,
256
+ "loss": 4.0425,
257
+ "step": 1850
258
+ },
259
+ {
260
+ "epoch": 1.53,
261
+ "learning_rate": 0.00014992783314542174,
262
+ "loss": 4.0156,
263
+ "step": 1900
264
+ },
265
+ {
266
+ "epoch": 1.57,
267
+ "learning_rate": 0.00014990856571813448,
268
+ "loss": 4.0317,
269
+ "step": 1950
270
+ },
271
+ {
272
+ "epoch": 1.62,
273
+ "learning_rate": 0.0001498870227721081,
274
+ "loss": 3.9819,
275
+ "step": 2000
276
+ },
277
+ {
278
+ "epoch": 1.62,
279
+ "eval_bleu": 0.39689971475030256,
280
+ "eval_loss": 3.9646406173706055,
281
+ "eval_runtime": 1910.9822,
282
+ "eval_samples_per_second": 4.609,
283
+ "eval_steps_per_second": 0.144,
284
+ "step": 2000
285
+ },
286
+ {
287
+ "epoch": 1.66,
288
+ "learning_rate": 0.0001498632049617587,
289
+ "loss": 3.9671,
290
+ "step": 2050
291
+ },
292
+ {
293
+ "epoch": 1.7,
294
+ "learning_rate": 0.00014983711301060673,
295
+ "loss": 3.9195,
296
+ "step": 2100
297
+ },
298
+ {
299
+ "epoch": 1.74,
300
+ "learning_rate": 0.00014980874771125478,
301
+ "loss": 3.9144,
302
+ "step": 2150
303
+ },
304
+ {
305
+ "epoch": 1.78,
306
+ "learning_rate": 0.00014977810992536359,
307
+ "loss": 3.9092,
308
+ "step": 2200
309
+ },
310
+ {
311
+ "epoch": 1.82,
312
+ "learning_rate": 0.00014974520058362584,
313
+ "loss": 3.9027,
314
+ "step": 2250
315
+ },
316
+ {
317
+ "epoch": 1.86,
318
+ "learning_rate": 0.00014971002068573793,
319
+ "loss": 3.8732,
320
+ "step": 2300
321
+ },
322
+ {
323
+ "epoch": 1.9,
324
+ "learning_rate": 0.00014967257130036961,
325
+ "loss": 3.8439,
326
+ "step": 2350
327
+ },
328
+ {
329
+ "epoch": 1.94,
330
+ "learning_rate": 0.00014963285356513152,
331
+ "loss": 3.8375,
332
+ "step": 2400
333
+ },
334
+ {
335
+ "epoch": 1.98,
336
+ "learning_rate": 0.0001495908686865405,
337
+ "loss": 3.8019,
338
+ "step": 2450
339
+ },
340
+ {
341
+ "epoch": 2.02,
342
+ "learning_rate": 0.00014954661793998317,
343
+ "loss": 3.7382,
344
+ "step": 2500
345
+ },
346
+ {
347
+ "epoch": 2.02,
348
+ "eval_bleu": 0.5913530218314237,
349
+ "eval_loss": 3.787797212600708,
350
+ "eval_runtime": 1914.1909,
351
+ "eval_samples_per_second": 4.601,
352
+ "eval_steps_per_second": 0.144,
353
+ "step": 2500
354
+ },
355
+ {
356
+ "epoch": 2.06,
357
+ "learning_rate": 0.0001495001026696769,
358
+ "loss": 3.5496,
359
+ "step": 2550
360
+ },
361
+ {
362
+ "epoch": 2.1,
363
+ "learning_rate": 0.00014945132428862936,
364
+ "loss": 3.5282,
365
+ "step": 2600
366
+ },
367
+ {
368
+ "epoch": 2.14,
369
+ "learning_rate": 0.00014940028427859524,
370
+ "loss": 3.5239,
371
+ "step": 2650
372
+ },
373
+ {
374
+ "epoch": 2.18,
375
+ "learning_rate": 0.00014934698419003133,
376
+ "loss": 3.4925,
377
+ "step": 2700
378
+ },
379
+ {
380
+ "epoch": 2.22,
381
+ "learning_rate": 0.0001492914256420496,
382
+ "loss": 3.5009,
383
+ "step": 2750
384
+ },
385
+ {
386
+ "epoch": 2.26,
387
+ "learning_rate": 0.00014923361032236776,
388
+ "loss": 3.4963,
389
+ "step": 2800
390
+ },
391
+ {
392
+ "epoch": 2.3,
393
+ "learning_rate": 0.00014917353998725823,
394
+ "loss": 3.491,
395
+ "step": 2850
396
+ },
397
+ {
398
+ "epoch": 2.34,
399
+ "learning_rate": 0.00014911121646149456,
400
+ "loss": 3.4631,
401
+ "step": 2900
402
+ },
403
+ {
404
+ "epoch": 2.38,
405
+ "learning_rate": 0.00014904664163829616,
406
+ "loss": 3.4539,
407
+ "step": 2950
408
+ },
409
+ {
410
+ "epoch": 2.42,
411
+ "learning_rate": 0.00014897981747927076,
412
+ "loss": 3.4567,
413
+ "step": 3000
414
+ },
415
+ {
416
+ "epoch": 2.42,
417
+ "eval_bleu": 0.7641516646931262,
418
+ "eval_loss": 3.635390520095825,
419
+ "eval_runtime": 1917.6351,
420
+ "eval_samples_per_second": 4.593,
421
+ "eval_steps_per_second": 0.144,
422
+ "step": 3000
423
+ },
424
+ {
425
+ "epoch": 2.46,
426
+ "learning_rate": 0.00014891074601435482,
427
+ "loss": 3.4458,
428
+ "step": 3050
429
+ },
430
+ {
431
+ "epoch": 2.5,
432
+ "learning_rate": 0.00014883942934175177,
433
+ "loss": 3.4317,
434
+ "step": 3100
435
+ },
436
+ {
437
+ "epoch": 2.54,
438
+ "learning_rate": 0.0001487658696278684,
439
+ "loss": 3.4017,
440
+ "step": 3150
441
+ },
442
+ {
443
+ "epoch": 2.58,
444
+ "learning_rate": 0.000148690069107249,
445
+ "loss": 3.4068,
446
+ "step": 3200
447
+ },
448
+ {
449
+ "epoch": 2.62,
450
+ "learning_rate": 0.00014861203008250745,
451
+ "loss": 3.3839,
452
+ "step": 3250
453
+ },
454
+ {
455
+ "epoch": 2.67,
456
+ "learning_rate": 0.0001485317549242574,
457
+ "loss": 3.4197,
458
+ "step": 3300
459
+ },
460
+ {
461
+ "epoch": 2.71,
462
+ "learning_rate": 0.00014844924607104,
463
+ "loss": 3.3836,
464
+ "step": 3350
465
+ },
466
+ {
467
+ "epoch": 2.75,
468
+ "learning_rate": 0.00014836450602925014,
469
+ "loss": 3.3611,
470
+ "step": 3400
471
+ },
472
+ {
473
+ "epoch": 2.79,
474
+ "learning_rate": 0.00014827753737306008,
475
+ "loss": 3.3541,
476
+ "step": 3450
477
+ },
478
+ {
479
+ "epoch": 2.83,
480
+ "learning_rate": 0.00014818834274434134,
481
+ "loss": 3.3587,
482
+ "step": 3500
483
+ },
484
+ {
485
+ "epoch": 2.83,
486
+ "eval_bleu": 1.1491131179238592,
487
+ "eval_loss": 3.5020267963409424,
488
+ "eval_runtime": 1914.8223,
489
+ "eval_samples_per_second": 4.6,
490
+ "eval_steps_per_second": 0.144,
491
+ "step": 3500
492
+ },
493
+ {
494
+ "epoch": 2.87,
495
+ "learning_rate": 0.00014809692485258445,
496
+ "loss": 3.3124,
497
+ "step": 3550
498
+ },
499
+ {
500
+ "epoch": 2.91,
501
+ "learning_rate": 0.00014800328647481662,
502
+ "loss": 3.3311,
503
+ "step": 3600
504
+ },
505
+ {
506
+ "epoch": 2.95,
507
+ "learning_rate": 0.00014790743045551744,
508
+ "loss": 3.302,
509
+ "step": 3650
510
+ },
511
+ {
512
+ "epoch": 2.99,
513
+ "learning_rate": 0.00014780935970653235,
514
+ "loss": 3.2845,
515
+ "step": 3700
516
+ },
517
+ {
518
+ "epoch": 3.03,
519
+ "learning_rate": 0.00014770907720698426,
520
+ "loss": 3.165,
521
+ "step": 3750
522
+ },
523
+ {
524
+ "epoch": 3.07,
525
+ "learning_rate": 0.00014760658600318318,
526
+ "loss": 3.0311,
527
+ "step": 3800
528
+ },
529
+ {
530
+ "epoch": 3.11,
531
+ "learning_rate": 0.00014750188920853338,
532
+ "loss": 3.0383,
533
+ "step": 3850
534
+ },
535
+ {
536
+ "epoch": 3.15,
537
+ "learning_rate": 0.00014739499000343914,
538
+ "loss": 3.0399,
539
+ "step": 3900
540
+ },
541
+ {
542
+ "epoch": 3.19,
543
+ "learning_rate": 0.0001472858916352079,
544
+ "loss": 3.0327,
545
+ "step": 3950
546
+ },
547
+ {
548
+ "epoch": 3.23,
549
+ "learning_rate": 0.0001471745974179517,
550
+ "loss": 3.007,
551
+ "step": 4000
552
+ },
553
+ {
554
+ "epoch": 3.23,
555
+ "eval_bleu": 1.106589199275075,
556
+ "eval_loss": 3.435346841812134,
557
+ "eval_runtime": 1918.0445,
558
+ "eval_samples_per_second": 4.592,
559
+ "eval_steps_per_second": 0.144,
560
+ "step": 4000
561
+ },
562
+ {
563
+ "epoch": 3.27,
564
+ "learning_rate": 0.00014706111073248656,
565
+ "loss": 3.0285,
566
+ "step": 4050
567
+ },
568
+ {
569
+ "epoch": 3.31,
570
+ "learning_rate": 0.0001469454350262297,
571
+ "loss": 3.0335,
572
+ "step": 4100
573
+ },
574
+ {
575
+ "epoch": 3.35,
576
+ "learning_rate": 0.0001468275738130948,
577
+ "loss": 3.0331,
578
+ "step": 4150
579
+ },
580
+ {
581
+ "epoch": 3.39,
582
+ "learning_rate": 0.0001467075306733854,
583
+ "loss": 3.024,
584
+ "step": 4200
585
+ },
586
+ {
587
+ "epoch": 3.43,
588
+ "learning_rate": 0.000146585309253686,
589
+ "loss": 3.0199,
590
+ "step": 4250
591
+ },
592
+ {
593
+ "epoch": 3.47,
594
+ "learning_rate": 0.00014646091326675126,
595
+ "loss": 2.9867,
596
+ "step": 4300
597
+ },
598
+ {
599
+ "epoch": 3.51,
600
+ "learning_rate": 0.00014633434649139344,
601
+ "loss": 3.0297,
602
+ "step": 4350
603
+ },
604
+ {
605
+ "epoch": 3.55,
606
+ "learning_rate": 0.00014620561277236722,
607
+ "loss": 2.9971,
608
+ "step": 4400
609
+ },
610
+ {
611
+ "epoch": 3.59,
612
+ "learning_rate": 0.0001460747160202534,
613
+ "loss": 2.9872,
614
+ "step": 4450
615
+ },
616
+ {
617
+ "epoch": 3.63,
618
+ "learning_rate": 0.0001459416602113397,
619
+ "loss": 2.9901,
620
+ "step": 4500
621
+ },
622
+ {
623
+ "epoch": 3.63,
624
+ "eval_bleu": 1.4837165956404195,
625
+ "eval_loss": 3.3302574157714844,
626
+ "eval_runtime": 1928.4326,
627
+ "eval_samples_per_second": 4.567,
628
+ "eval_steps_per_second": 0.143,
629
+ "step": 4500
630
+ },
631
+ {
632
+ "epoch": 3.67,
633
+ "learning_rate": 0.00014580644938750012,
634
+ "loss": 2.9842,
635
+ "step": 4550
636
+ },
637
+ {
638
+ "epoch": 3.72,
639
+ "learning_rate": 0.00014566908765607222,
640
+ "loss": 2.9818,
641
+ "step": 4600
642
+ },
643
+ {
644
+ "epoch": 3.76,
645
+ "learning_rate": 0.00014552957918973226,
646
+ "loss": 2.9784,
647
+ "step": 4650
648
+ },
649
+ {
650
+ "epoch": 3.8,
651
+ "learning_rate": 0.00014538792822636849,
652
+ "loss": 2.974,
653
+ "step": 4700
654
+ },
655
+ {
656
+ "epoch": 3.84,
657
+ "learning_rate": 0.00014524413906895234,
658
+ "loss": 2.9592,
659
+ "step": 4750
660
+ },
661
+ {
662
+ "epoch": 3.88,
663
+ "learning_rate": 0.00014509821608540784,
664
+ "loss": 2.9519,
665
+ "step": 4800
666
+ },
667
+ {
668
+ "epoch": 3.92,
669
+ "learning_rate": 0.00014495016370847882,
670
+ "loss": 2.9672,
671
+ "step": 4850
672
+ },
673
+ {
674
+ "epoch": 3.96,
675
+ "learning_rate": 0.00014479998643559435,
676
+ "loss": 2.944,
677
+ "step": 4900
678
+ },
679
+ {
680
+ "epoch": 4.0,
681
+ "learning_rate": 0.00014464768882873198,
682
+ "loss": 2.9653,
683
+ "step": 4950
684
+ },
685
+ {
686
+ "epoch": 4.04,
687
+ "learning_rate": 0.00014449327551427935,
688
+ "loss": 2.7312,
689
+ "step": 5000
690
+ },
691
+ {
692
+ "epoch": 4.04,
693
+ "eval_bleu": 1.3796115530850706,
694
+ "eval_loss": 3.2999939918518066,
695
+ "eval_runtime": 1915.5145,
696
+ "eval_samples_per_second": 4.598,
697
+ "eval_steps_per_second": 0.144,
698
+ "step": 5000
699
+ },
700
+ {
701
+ "epoch": 4.08,
702
+ "learning_rate": 0.0001443367511828934,
703
+ "loss": 2.6928,
704
+ "step": 5050
705
+ },
706
+ {
707
+ "epoch": 4.12,
708
+ "learning_rate": 0.0001441781205893582,
709
+ "loss": 2.6899,
710
+ "step": 5100
711
+ },
712
+ {
713
+ "epoch": 4.16,
714
+ "learning_rate": 0.00014401738855244028,
715
+ "loss": 2.6697,
716
+ "step": 5150
717
+ },
718
+ {
719
+ "epoch": 4.2,
720
+ "learning_rate": 0.00014385455995474222,
721
+ "loss": 2.6839,
722
+ "step": 5200
723
+ },
724
+ {
725
+ "epoch": 4.24,
726
+ "learning_rate": 0.00014368963974255454,
727
+ "loss": 2.6911,
728
+ "step": 5250
729
+ },
730
+ {
731
+ "epoch": 4.28,
732
+ "learning_rate": 0.0001435226329257053,
733
+ "loss": 2.7043,
734
+ "step": 5300
735
+ },
736
+ {
737
+ "epoch": 4.32,
738
+ "learning_rate": 0.00014335354457740792,
739
+ "loss": 2.6771,
740
+ "step": 5350
741
+ },
742
+ {
743
+ "epoch": 4.36,
744
+ "learning_rate": 0.00014318237983410706,
745
+ "loss": 2.6987,
746
+ "step": 5400
747
+ },
748
+ {
749
+ "epoch": 4.4,
750
+ "learning_rate": 0.0001430091438953227,
751
+ "loss": 2.6854,
752
+ "step": 5450
753
+ },
754
+ {
755
+ "epoch": 4.44,
756
+ "learning_rate": 0.00014283384202349203,
757
+ "loss": 2.6822,
758
+ "step": 5500
759
+ },
760
+ {
761
+ "epoch": 4.44,
762
+ "eval_bleu": 1.572420203309693,
763
+ "eval_loss": 3.2495412826538086,
764
+ "eval_runtime": 1922.8389,
765
+ "eval_samples_per_second": 4.581,
766
+ "eval_steps_per_second": 0.144,
767
+ "step": 5500
768
+ },
769
+ {
770
+ "epoch": 4.48,
771
+ "learning_rate": 0.00014265647954380976,
772
+ "loss": 2.7064,
773
+ "step": 5550
774
+ },
775
+ {
776
+ "epoch": 4.52,
777
+ "learning_rate": 0.00014247706184406618,
778
+ "loss": 2.6905,
779
+ "step": 5600
780
+ },
781
+ {
782
+ "epoch": 4.56,
783
+ "learning_rate": 0.00014229559437448362,
784
+ "loss": 2.6847,
785
+ "step": 5650
786
+ },
787
+ {
788
+ "epoch": 4.6,
789
+ "learning_rate": 0.00014211208264755092,
790
+ "loss": 2.6825,
791
+ "step": 5700
792
+ },
793
+ {
794
+ "epoch": 4.64,
795
+ "learning_rate": 0.00014192653223785577,
796
+ "loss": 2.6759,
797
+ "step": 5750
798
+ },
799
+ {
800
+ "epoch": 4.68,
801
+ "learning_rate": 0.0001417389487819156,
802
+ "loss": 2.6888,
803
+ "step": 5800
804
+ },
805
+ {
806
+ "epoch": 4.72,
807
+ "learning_rate": 0.00014154933797800621,
808
+ "loss": 2.6722,
809
+ "step": 5850
810
+ },
811
+ {
812
+ "epoch": 4.77,
813
+ "learning_rate": 0.0001413577055859888,
814
+ "loss": 2.688,
815
+ "step": 5900
816
+ },
817
+ {
818
+ "epoch": 4.81,
819
+ "learning_rate": 0.00014116405742713484,
820
+ "loss": 2.6934,
821
+ "step": 5950
822
+ },
823
+ {
824
+ "epoch": 4.85,
825
+ "learning_rate": 0.00014096839938394936,
826
+ "loss": 2.6577,
827
+ "step": 6000
828
+ },
829
+ {
830
+ "epoch": 4.85,
831
+ "eval_bleu": 1.6301048703044616,
832
+ "eval_loss": 3.176309823989868,
833
+ "eval_runtime": 1919.9365,
834
+ "eval_samples_per_second": 4.588,
835
+ "eval_steps_per_second": 0.144,
836
+ "step": 6000
837
+ },
838
+ {
839
+ "epoch": 4.89,
840
+ "learning_rate": 0.00014077073739999222,
841
+ "loss": 2.6749,
842
+ "step": 6050
843
+ },
844
+ {
845
+ "epoch": 4.93,
846
+ "learning_rate": 0.0001405710774796975,
847
+ "loss": 2.6639,
848
+ "step": 6100
849
+ },
850
+ {
851
+ "epoch": 4.97,
852
+ "learning_rate": 0.0001403694256881913,
853
+ "loss": 2.6828,
854
+ "step": 6150
855
+ },
856
+ {
857
+ "epoch": 5.01,
858
+ "learning_rate": 0.00014016578815110716,
859
+ "loss": 2.645,
860
+ "step": 6200
861
+ },
862
+ {
863
+ "epoch": 5.05,
864
+ "learning_rate": 0.00013996017105440036,
865
+ "loss": 2.3837,
866
+ "step": 6250
867
+ },
868
+ {
869
+ "epoch": 5.09,
870
+ "learning_rate": 0.00013975258064415972,
871
+ "loss": 2.3875,
872
+ "step": 6300
873
+ },
874
+ {
875
+ "epoch": 5.13,
876
+ "learning_rate": 0.00013954302322641797,
877
+ "loss": 2.3975,
878
+ "step": 6350
879
+ },
880
+ {
881
+ "epoch": 5.17,
882
+ "learning_rate": 0.00013933150516696024,
883
+ "loss": 2.4332,
884
+ "step": 6400
885
+ },
886
+ {
887
+ "epoch": 5.21,
888
+ "learning_rate": 0.00013911803289113055,
889
+ "loss": 2.3976,
890
+ "step": 6450
891
+ },
892
+ {
893
+ "epoch": 5.25,
894
+ "learning_rate": 0.00013890261288363676,
895
+ "loss": 2.4118,
896
+ "step": 6500
897
+ },
898
+ {
899
+ "epoch": 5.25,
900
+ "eval_bleu": 1.5739999205693251,
901
+ "eval_loss": 3.2138876914978027,
902
+ "eval_runtime": 1926.1281,
903
+ "eval_samples_per_second": 4.573,
904
+ "eval_steps_per_second": 0.143,
905
+ "step": 6500
906
+ },
907
+ {
908
+ "epoch": 5.29,
909
+ "learning_rate": 0.00013868525168835353,
910
+ "loss": 2.4436,
911
+ "step": 6550
912
+ },
913
+ {
914
+ "epoch": 5.33,
915
+ "learning_rate": 0.0001384659559081235,
916
+ "loss": 2.428,
917
+ "step": 6600
918
+ },
919
+ {
920
+ "epoch": 5.37,
921
+ "learning_rate": 0.0001382447322045568,
922
+ "loss": 2.4192,
923
+ "step": 6650
924
+ },
925
+ {
926
+ "epoch": 5.41,
927
+ "learning_rate": 0.0001380215872978285,
928
+ "loss": 2.4337,
929
+ "step": 6700
930
+ },
931
+ {
932
+ "epoch": 5.45,
933
+ "learning_rate": 0.0001377965279664748,
934
+ "loss": 2.4193,
935
+ "step": 6750
936
+ },
937
+ {
938
+ "epoch": 5.49,
939
+ "learning_rate": 0.0001375695610471868,
940
+ "loss": 2.438,
941
+ "step": 6800
942
+ },
943
+ {
944
+ "epoch": 5.53,
945
+ "learning_rate": 0.00013734069343460293,
946
+ "loss": 2.4385,
947
+ "step": 6850
948
+ },
949
+ {
950
+ "epoch": 5.57,
951
+ "learning_rate": 0.0001371099320810995,
952
+ "loss": 2.4556,
953
+ "step": 6900
954
+ },
955
+ {
956
+ "epoch": 5.61,
957
+ "learning_rate": 0.0001368772839965797,
958
+ "loss": 2.4518,
959
+ "step": 6950
960
+ },
961
+ {
962
+ "epoch": 5.65,
963
+ "learning_rate": 0.00013664275624826025,
964
+ "loss": 2.4345,
965
+ "step": 7000
966
+ },
967
+ {
968
+ "epoch": 5.65,
969
+ "eval_bleu": 1.9002943282055358,
970
+ "eval_loss": 3.1631877422332764,
971
+ "eval_runtime": 1923.9555,
972
+ "eval_samples_per_second": 4.578,
973
+ "eval_steps_per_second": 0.143,
974
+ "step": 7000
975
+ },
976
+ {
977
+ "epoch": 5.69,
978
+ "learning_rate": 0.00013640635596045707,
979
+ "loss": 2.4438,
980
+ "step": 7050
981
+ },
982
+ {
983
+ "epoch": 5.73,
984
+ "learning_rate": 0.00013616809031436876,
985
+ "loss": 2.4239,
986
+ "step": 7100
987
+ },
988
+ {
989
+ "epoch": 5.77,
990
+ "learning_rate": 0.0001359279665478584,
991
+ "loss": 2.4293,
992
+ "step": 7150
993
+ },
994
+ {
995
+ "epoch": 5.82,
996
+ "learning_rate": 0.0001356859919552337,
997
+ "loss": 2.4569,
998
+ "step": 7200
999
+ },
1000
+ {
1001
+ "epoch": 5.86,
1002
+ "learning_rate": 0.0001354421738870255,
1003
+ "loss": 2.4437,
1004
+ "step": 7250
1005
+ },
1006
+ {
1007
+ "epoch": 5.9,
1008
+ "learning_rate": 0.00013519651974976433,
1009
+ "loss": 2.4373,
1010
+ "step": 7300
1011
+ },
1012
+ {
1013
+ "epoch": 5.94,
1014
+ "learning_rate": 0.00013494903700575562,
1015
+ "loss": 2.451,
1016
+ "step": 7350
1017
+ },
1018
+ {
1019
+ "epoch": 5.98,
1020
+ "learning_rate": 0.00013469973317285284,
1021
+ "loss": 2.4468,
1022
+ "step": 7400
1023
+ },
1024
+ {
1025
+ "epoch": 6.02,
1026
+ "learning_rate": 0.0001344486158242292,
1027
+ "loss": 2.333,
1028
+ "step": 7450
1029
+ },
1030
+ {
1031
+ "epoch": 6.06,
1032
+ "learning_rate": 0.00013419569258814757,
1033
+ "loss": 2.158,
1034
+ "step": 7500
1035
+ },
1036
+ {
1037
+ "epoch": 6.06,
1038
+ "eval_bleu": 1.8111395024627068,
1039
+ "eval_loss": 3.179304361343384,
1040
+ "eval_runtime": 1921.8485,
1041
+ "eval_samples_per_second": 4.583,
1042
+ "eval_steps_per_second": 0.144,
1043
+ "step": 7500
1044
+ },
1045
+ {
1046
+ "epoch": 6.1,
1047
+ "learning_rate": 0.00013394097114772887,
1048
+ "loss": 2.1613,
1049
+ "step": 7550
1050
+ },
1051
+ {
1052
+ "epoch": 6.14,
1053
+ "learning_rate": 0.00013368445924071844,
1054
+ "loss": 2.1807,
1055
+ "step": 7600
1056
+ },
1057
+ {
1058
+ "epoch": 6.18,
1059
+ "learning_rate": 0.00013342616465925126,
1060
+ "loss": 2.1873,
1061
+ "step": 7650
1062
+ },
1063
+ {
1064
+ "epoch": 6.22,
1065
+ "learning_rate": 0.00013316609524961502,
1066
+ "loss": 2.1955,
1067
+ "step": 7700
1068
+ },
1069
+ {
1070
+ "epoch": 6.26,
1071
+ "learning_rate": 0.00013290425891201196,
1072
+ "loss": 2.2115,
1073
+ "step": 7750
1074
+ },
1075
+ {
1076
+ "epoch": 6.3,
1077
+ "learning_rate": 0.00013264066360031872,
1078
+ "loss": 2.1964,
1079
+ "step": 7800
1080
+ },
1081
+ {
1082
+ "epoch": 6.34,
1083
+ "learning_rate": 0.0001323753173218448,
1084
+ "loss": 2.2045,
1085
+ "step": 7850
1086
+ },
1087
+ {
1088
+ "epoch": 6.38,
1089
+ "learning_rate": 0.00013210822813708936,
1090
+ "loss": 2.1907,
1091
+ "step": 7900
1092
+ },
1093
+ {
1094
+ "epoch": 6.42,
1095
+ "learning_rate": 0.0001318394041594963,
1096
+ "loss": 2.2066,
1097
+ "step": 7950
1098
+ },
1099
+ {
1100
+ "epoch": 6.46,
1101
+ "learning_rate": 0.00013156885355520778,
1102
+ "loss": 2.2051,
1103
+ "step": 8000
1104
+ },
1105
+ {
1106
+ "epoch": 6.46,
1107
+ "eval_bleu": 1.98361849433367,
1108
+ "eval_loss": 3.160503625869751,
1109
+ "eval_runtime": 1922.8791,
1110
+ "eval_samples_per_second": 4.581,
1111
+ "eval_steps_per_second": 0.144,
1112
+ "step": 8000
1113
+ }
1114
+ ],
1115
+ "max_steps": 30000,
1116
+ "num_train_epochs": 25,
1117
+ "total_flos": 4.9816432750591004e+20,
1118
+ "trial_name": null,
1119
+ "trial_params": null
1120
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60540c0ce430e5b6261166a4050827eb5d7605940364f01a54b5855933773f5
3
+ size 4024
vocab.txt ADDED
The diff for this file is too large to render. See raw diff