jdqqjr committed on
Commit
92a99b7
·
verified ·
1 Parent(s): cc9b18d

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: other
4
+ base_model: jdqqjr/Qwen2.5-0.5B-Open-R1-Distill
5
+ tags:
6
+ - llama-factory
7
+ - full
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: Qwen2.5-0.5B-Open-R1-Distill-FactThink-SFT
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # Qwen2.5-0.5B-Open-R1-Distill-FactThink-SFT
18
+
19
+ This model is a fine-tuned version of [jdqqjr/Qwen2.5-0.5B-Open-R1-Distill](https://huggingface.co/jdqqjr/Qwen2.5-0.5B-Open-R1-Distill) on the cot_fact_think_sft_train dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 0.0001
39
+ - train_batch_size: 2
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - num_devices: 4
44
+ - gradient_accumulation_steps: 4
45
+ - total_train_batch_size: 32
46
+ - total_eval_batch_size: 32
47
+ - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
48
+ - lr_scheduler_type: cosine
49
+ - num_epochs: 6.0
50
+ - mixed_precision_training: Native AMP
51
+
52
+ ### Training results
53
+
54
+
55
+
56
+ ### Framework versions
57
+
58
+ - Transformers 4.47.0
59
+ - Pytorch 2.3.1+cu121
60
+ - Datasets 2.20.0
61
+ - Tokenizers 0.21.0
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.970666666666666,
3
+ "total_flos": 6.092825392526131e+16,
4
+ "train_loss": 3.2993942699436625,
5
+ "train_runtime": 2130.8581,
6
+ "train_samples_per_second": 16.892,
7
+ "train_steps_per_second": 0.527
8
+ }
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "jdqqjr/Qwen2.5-0.5B-Open-R1-Distill",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 896,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 4864,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 21,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 14,
17
+ "num_hidden_layers": 24,
18
+ "num_key_value_heads": 2,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": true,
24
+ "torch_dtype": "float32",
25
+ "transformers_version": "4.47.0",
26
+ "use_cache": false,
27
+ "use_sliding_window": false,
28
+ "vocab_size": 151936
29
+ }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.47.0"
14
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb2b3bb82c38dbb80b2fdfd7bdcd898060984bbd67d38cbc2fe38517b1790e23
3
+ size 1976163472
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|im_end|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{% set system_message = 'Your role as an assistant involves thoroughly exploring questions through a systematic long thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution. In the Thought section, detail your reasoning process using the specified format: <think> {thought with steps separated with \\'\\n\\n\\'} </think> Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The solution should remain a logical, accurate, concise expression style and detail necessary step needed to reach the conclusion, formatted as follows: <answer> {final formatted, precise, and clear solution} </answer> Now, try to solve the following question through the above guidelines:' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>\\n' + '\\n' }}{% endif %}{% endfor %}",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|im_end|>",
205
+ "padding_side": "right",
206
+ "split_special_tokens": false,
207
+ "tokenizer_class": "Qwen2Tokenizer",
208
+ "unk_token": null
209
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.970666666666666,
3
+ "total_flos": 6.092825392526131e+16,
4
+ "train_loss": 3.2993942699436625,
5
+ "train_runtime": 2130.8581,
6
+ "train_samples_per_second": 16.892,
7
+ "train_steps_per_second": 0.527
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,826 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.970666666666666,
5
+ "eval_steps": 500,
6
+ "global_step": 1122,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05333333333333334,
13
+ "grad_norm": 24.72922706604004,
14
+ "learning_rate": 9.999039635236575e-05,
15
+ "loss": 9.3592,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.10666666666666667,
20
+ "grad_norm": 15.090938568115234,
21
+ "learning_rate": 9.99433669591504e-05,
22
+ "loss": 8.1951,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.16,
27
+ "grad_norm": 11.75036334991455,
28
+ "learning_rate": 9.985718470743917e-05,
29
+ "loss": 8.0486,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.21333333333333335,
34
+ "grad_norm": null,
35
+ "learning_rate": 9.974620019358045e-05,
36
+ "loss": 7.9758,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.26666666666666666,
41
+ "grad_norm": 11.149462699890137,
42
+ "learning_rate": 9.958583897050699e-05,
43
+ "loss": 8.0914,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.32,
48
+ "grad_norm": 10.258933067321777,
49
+ "learning_rate": 9.93866051712574e-05,
50
+ "loss": 8.068,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.37333333333333335,
55
+ "grad_norm": 11.847441673278809,
56
+ "learning_rate": 9.91486549841951e-05,
57
+ "loss": 8.0715,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.4266666666666667,
62
+ "grad_norm": 10.42148208618164,
63
+ "learning_rate": 9.887217494920655e-05,
64
+ "loss": 8.0495,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.48,
69
+ "grad_norm": 9.987250328063965,
70
+ "learning_rate": 9.855738181146428e-05,
71
+ "loss": 8.15,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.5333333333333333,
76
+ "grad_norm": 10.14989948272705,
77
+ "learning_rate": 9.820452235151049e-05,
78
+ "loss": 7.9533,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.5866666666666667,
83
+ "grad_norm": 10.510122299194336,
84
+ "learning_rate": 9.781387319179466e-05,
85
+ "loss": 8.0125,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.64,
90
+ "grad_norm": 10.167430877685547,
91
+ "learning_rate": 9.738574057981678e-05,
92
+ "loss": 7.867,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.6933333333333334,
97
+ "grad_norm": 10.005114555358887,
98
+ "learning_rate": 9.69204601480461e-05,
99
+ "loss": 8.0841,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.7466666666666667,
104
+ "grad_norm": 9.920500755310059,
105
+ "learning_rate": 9.641839665080363e-05,
106
+ "loss": 7.8963,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.8,
111
+ "grad_norm": 10.194422721862793,
112
+ "learning_rate": 9.58799436783149e-05,
113
+ "loss": 7.9465,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.8533333333333334,
118
+ "grad_norm": 9.244508743286133,
119
+ "learning_rate": 9.53055233481567e-05,
120
+ "loss": 7.8268,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.9066666666666666,
125
+ "grad_norm": 9.413444519042969,
126
+ "learning_rate": 9.469558597434018e-05,
127
+ "loss": 7.9496,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.96,
132
+ "grad_norm": 9.113428115844727,
133
+ "learning_rate": 9.405060971428923e-05,
134
+ "loss": 7.8761,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 1.0106666666666666,
139
+ "grad_norm": 10.662834167480469,
140
+ "learning_rate": 9.337110019399149e-05,
141
+ "loss": 7.0447,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 1.064,
146
+ "grad_norm": 9.262643814086914,
147
+ "learning_rate": 9.265759011161519e-05,
148
+ "loss": 5.604,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 1.1173333333333333,
153
+ "grad_norm": 10.475074768066406,
154
+ "learning_rate": 9.191063881990308e-05,
155
+ "loss": 5.5011,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 1.1706666666666667,
160
+ "grad_norm": 9.422979354858398,
161
+ "learning_rate": 9.113083188767057e-05,
162
+ "loss": 5.6094,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 1.224,
167
+ "grad_norm": 9.824627876281738,
168
+ "learning_rate": 9.03187806407519e-05,
169
+ "loss": 5.5714,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 1.2773333333333334,
174
+ "grad_norm": 9.943827629089355,
175
+ "learning_rate": 8.947512168275429e-05,
176
+ "loss": 5.6587,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 1.3306666666666667,
181
+ "grad_norm": 8.791569709777832,
182
+ "learning_rate": 8.86005163959956e-05,
183
+ "loss": 5.7327,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 1.384,
188
+ "grad_norm": 9.47322940826416,
189
+ "learning_rate": 8.769565042301691e-05,
190
+ "loss": 5.6468,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 1.4373333333333334,
195
+ "grad_norm": 9.704258918762207,
196
+ "learning_rate": 8.676123312907642e-05,
197
+ "loss": 5.7799,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 1.4906666666666666,
202
+ "grad_norm": 9.217696189880371,
203
+ "learning_rate": 8.579799704604596e-05,
204
+ "loss": 5.7282,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 1.544,
209
+ "grad_norm": 9.473422050476074,
210
+ "learning_rate": 8.480669729814635e-05,
211
+ "loss": 5.7854,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 1.5973333333333333,
216
+ "grad_norm": 9.128384590148926,
217
+ "learning_rate": 8.378811100997123e-05,
218
+ "loss": 5.824,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 1.6506666666666665,
223
+ "grad_norm": 9.525070190429688,
224
+ "learning_rate": 8.274303669726426e-05,
225
+ "loss": 5.8272,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 1.704,
230
+ "grad_norm": 9.38967227935791,
231
+ "learning_rate": 8.167229364092647e-05,
232
+ "loss": 5.8388,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 1.7573333333333334,
237
+ "grad_norm": 9.629057884216309,
238
+ "learning_rate": 8.057672124474508e-05,
239
+ "loss": 5.834,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 1.8106666666666666,
244
+ "grad_norm": 9.649420738220215,
245
+ "learning_rate": 7.945717837734688e-05,
246
+ "loss": 5.7878,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 1.8639999999999999,
251
+ "grad_norm": 10.08434009552002,
252
+ "learning_rate": 7.831454269889251e-05,
253
+ "loss": 5.7565,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 1.9173333333333333,
258
+ "grad_norm": 8.834407806396484,
259
+ "learning_rate": 7.714970997303898e-05,
260
+ "loss": 5.805,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 1.9706666666666668,
265
+ "grad_norm": 8.648845672607422,
266
+ "learning_rate": 7.596359336471015e-05,
267
+ "loss": 5.7674,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 2.021333333333333,
272
+ "grad_norm": 9.286345481872559,
273
+ "learning_rate": 7.475712272422557e-05,
274
+ "loss": 4.6739,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 2.074666666666667,
279
+ "grad_norm": 9.184532165527344,
280
+ "learning_rate": 7.35312438583488e-05,
281
+ "loss": 3.4263,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 2.128,
286
+ "grad_norm": 9.793543815612793,
287
+ "learning_rate": 7.228691778882693e-05,
288
+ "loss": 3.3865,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 2.1813333333333333,
293
+ "grad_norm": 9.974711418151855,
294
+ "learning_rate": 7.102511999900213e-05,
295
+ "loss": 3.4448,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 2.2346666666666666,
300
+ "grad_norm": 9.690023422241211,
301
+ "learning_rate": 6.974683966908641e-05,
302
+ "loss": 3.4652,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 2.288,
307
+ "grad_norm": 10.495637893676758,
308
+ "learning_rate": 6.84530789006985e-05,
309
+ "loss": 3.4585,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 2.3413333333333335,
314
+ "grad_norm": 9.875127792358398,
315
+ "learning_rate": 6.714485193127126e-05,
316
+ "loss": 3.4596,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 2.3946666666666667,
321
+ "grad_norm": 9.386152267456055,
322
+ "learning_rate": 6.582318433894513e-05,
323
+ "loss": 3.4094,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 2.448,
328
+ "grad_norm": 9.699738502502441,
329
+ "learning_rate": 6.448911223857123e-05,
330
+ "loss": 3.5483,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 2.501333333333333,
335
+ "grad_norm": 9.54281234741211,
336
+ "learning_rate": 6.314368146945418e-05,
337
+ "loss": 3.4739,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 2.554666666666667,
342
+ "grad_norm": 10.121294975280762,
343
+ "learning_rate": 6.178794677547137e-05,
344
+ "loss": 3.5686,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 2.608,
349
+ "grad_norm": 10.000744819641113,
350
+ "learning_rate": 6.0422970978211834e-05,
351
+ "loss": 3.4778,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 2.6613333333333333,
356
+ "grad_norm": 9.480353355407715,
357
+ "learning_rate": 5.904982414378233e-05,
358
+ "loss": 3.5983,
359
+ "step": 500
360
+ },
361
+ {
362
+ "epoch": 2.7146666666666666,
363
+ "grad_norm": 10.401140213012695,
364
+ "learning_rate": 5.7669582743934284e-05,
365
+ "loss": 3.6008,
366
+ "step": 510
367
+ },
368
+ {
369
+ "epoch": 2.768,
370
+ "grad_norm": 9.911478042602539,
371
+ "learning_rate": 5.628332881216899e-05,
372
+ "loss": 3.5318,
373
+ "step": 520
374
+ },
375
+ {
376
+ "epoch": 2.8213333333333335,
377
+ "grad_norm": 9.741485595703125,
378
+ "learning_rate": 5.4892149095482815e-05,
379
+ "loss": 3.5949,
380
+ "step": 530
381
+ },
382
+ {
383
+ "epoch": 2.8746666666666667,
384
+ "grad_norm": 9.522744178771973,
385
+ "learning_rate": 5.3497134202417096e-05,
386
+ "loss": 3.5776,
387
+ "step": 540
388
+ },
389
+ {
390
+ "epoch": 2.928,
391
+ "grad_norm": 9.862764358520508,
392
+ "learning_rate": 5.209937774808098e-05,
393
+ "loss": 3.5341,
394
+ "step": 550
395
+ },
396
+ {
397
+ "epoch": 2.981333333333333,
398
+ "grad_norm": 9.565387725830078,
399
+ "learning_rate": 5.069997549681718e-05,
400
+ "loss": 3.4962,
401
+ "step": 560
402
+ },
403
+ {
404
+ "epoch": 3.032,
405
+ "grad_norm": 8.7464599609375,
406
+ "learning_rate": 4.930002450318282e-05,
407
+ "loss": 2.2876,
408
+ "step": 570
409
+ },
410
+ {
411
+ "epoch": 3.0853333333333333,
412
+ "grad_norm": 7.954220294952393,
413
+ "learning_rate": 4.790062225191902e-05,
414
+ "loss": 1.6802,
415
+ "step": 580
416
+ },
417
+ {
418
+ "epoch": 3.1386666666666665,
419
+ "grad_norm": 9.074536323547363,
420
+ "learning_rate": 4.650286579758291e-05,
421
+ "loss": 1.6187,
422
+ "step": 590
423
+ },
424
+ {
425
+ "epoch": 3.192,
426
+ "grad_norm": 8.827848434448242,
427
+ "learning_rate": 4.510785090451719e-05,
428
+ "loss": 1.6036,
429
+ "step": 600
430
+ },
431
+ {
432
+ "epoch": 3.2453333333333334,
433
+ "grad_norm": 8.552383422851562,
434
+ "learning_rate": 4.371667118783101e-05,
435
+ "loss": 1.6022,
436
+ "step": 610
437
+ },
438
+ {
439
+ "epoch": 3.2986666666666666,
440
+ "grad_norm": 8.28974437713623,
441
+ "learning_rate": 4.233041725606572e-05,
442
+ "loss": 1.6742,
443
+ "step": 620
444
+ },
445
+ {
446
+ "epoch": 3.352,
447
+ "grad_norm": 10.971680641174316,
448
+ "learning_rate": 4.095017585621767e-05,
449
+ "loss": 1.5802,
450
+ "step": 630
451
+ },
452
+ {
453
+ "epoch": 3.405333333333333,
454
+ "grad_norm": 8.309553146362305,
455
+ "learning_rate": 3.9577029021788164e-05,
456
+ "loss": 1.6889,
457
+ "step": 640
458
+ },
459
+ {
460
+ "epoch": 3.458666666666667,
461
+ "grad_norm": 8.555489540100098,
462
+ "learning_rate": 3.821205322452863e-05,
463
+ "loss": 1.6509,
464
+ "step": 650
465
+ },
466
+ {
467
+ "epoch": 3.512,
468
+ "grad_norm": 9.02466869354248,
469
+ "learning_rate": 3.685631853054583e-05,
470
+ "loss": 1.6118,
471
+ "step": 660
472
+ },
473
+ {
474
+ "epoch": 3.5653333333333332,
475
+ "grad_norm": 8.728509902954102,
476
+ "learning_rate": 3.5510887761428765e-05,
477
+ "loss": 1.6266,
478
+ "step": 670
479
+ },
480
+ {
481
+ "epoch": 3.618666666666667,
482
+ "grad_norm": 8.970052719116211,
483
+ "learning_rate": 3.4176815661054885e-05,
484
+ "loss": 1.6069,
485
+ "step": 680
486
+ },
487
+ {
488
+ "epoch": 3.672,
489
+ "grad_norm": 8.516586303710938,
490
+ "learning_rate": 3.2855148068728756e-05,
491
+ "loss": 1.6659,
492
+ "step": 690
493
+ },
494
+ {
495
+ "epoch": 3.7253333333333334,
496
+ "grad_norm": 8.919830322265625,
497
+ "learning_rate": 3.1546921099301506e-05,
498
+ "loss": 1.6846,
499
+ "step": 700
500
+ },
501
+ {
502
+ "epoch": 3.7786666666666666,
503
+ "grad_norm": 8.541778564453125,
504
+ "learning_rate": 3.0253160330913598e-05,
505
+ "loss": 1.6181,
506
+ "step": 710
507
+ },
508
+ {
509
+ "epoch": 3.832,
510
+ "grad_norm": 8.83139419555664,
511
+ "learning_rate": 2.8974880000997877e-05,
512
+ "loss": 1.5836,
513
+ "step": 720
514
+ },
515
+ {
516
+ "epoch": 3.8853333333333335,
517
+ "grad_norm": 8.658729553222656,
518
+ "learning_rate": 2.771308221117309e-05,
519
+ "loss": 1.7024,
520
+ "step": 730
521
+ },
522
+ {
523
+ "epoch": 3.9386666666666668,
524
+ "grad_norm": 8.480195999145508,
525
+ "learning_rate": 2.6468756141651206e-05,
526
+ "loss": 1.6222,
527
+ "step": 740
528
+ },
529
+ {
530
+ "epoch": 3.992,
531
+ "grad_norm": 8.250679016113281,
532
+ "learning_rate": 2.5242877275774445e-05,
533
+ "loss": 1.6467,
534
+ "step": 750
535
+ },
536
+ {
537
+ "epoch": 4.042666666666666,
538
+ "grad_norm": 5.594525337219238,
539
+ "learning_rate": 2.403640663528986e-05,
540
+ "loss": 0.7207,
541
+ "step": 760
542
+ },
543
+ {
544
+ "epoch": 4.096,
545
+ "grad_norm": 5.844119071960449,
546
+ "learning_rate": 2.285029002696103e-05,
547
+ "loss": 0.6291,
548
+ "step": 770
549
+ },
550
+ {
551
+ "epoch": 4.149333333333334,
552
+ "grad_norm": 6.574643135070801,
553
+ "learning_rate": 2.1685457301107504e-05,
554
+ "loss": 0.5735,
555
+ "step": 780
556
+ },
557
+ {
558
+ "epoch": 4.2026666666666666,
559
+ "grad_norm": 5.774216651916504,
560
+ "learning_rate": 2.054282162265313e-05,
561
+ "loss": 0.5888,
562
+ "step": 790
563
+ },
564
+ {
565
+ "epoch": 4.256,
566
+ "grad_norm": 5.600583553314209,
567
+ "learning_rate": 1.9423278755254932e-05,
568
+ "loss": 0.5797,
569
+ "step": 800
570
+ },
571
+ {
572
+ "epoch": 4.309333333333333,
573
+ "grad_norm": 6.131619453430176,
574
+ "learning_rate": 1.8327706359073526e-05,
575
+ "loss": 0.6167,
576
+ "step": 810
577
+ },
578
+ {
579
+ "epoch": 4.362666666666667,
580
+ "grad_norm": 5.416571140289307,
581
+ "learning_rate": 1.725696330273575e-05,
582
+ "loss": 0.5787,
583
+ "step": 820
584
+ },
585
+ {
586
+ "epoch": 4.416,
587
+ "grad_norm": 5.745408058166504,
588
+ "learning_rate": 1.6211888990028785e-05,
589
+ "loss": 0.5982,
590
+ "step": 830
591
+ },
592
+ {
593
+ "epoch": 4.469333333333333,
594
+ "grad_norm": 6.464406967163086,
595
+ "learning_rate": 1.5193302701853673e-05,
596
+ "loss": 0.5994,
597
+ "step": 840
598
+ },
599
+ {
600
+ "epoch": 4.522666666666667,
601
+ "grad_norm": 5.810715198516846,
602
+ "learning_rate": 1.4202002953954041e-05,
603
+ "loss": 0.5936,
604
+ "step": 850
605
+ },
606
+ {
607
+ "epoch": 4.576,
608
+ "grad_norm": 5.5798516273498535,
609
+ "learning_rate": 1.323876687092359e-05,
610
+ "loss": 0.5807,
611
+ "step": 860
612
+ },
613
+ {
614
+ "epoch": 4.629333333333333,
615
+ "grad_norm": 6.70074462890625,
616
+ "learning_rate": 1.2304349576983093e-05,
617
+ "loss": 0.5784,
618
+ "step": 870
619
+ },
620
+ {
621
+ "epoch": 4.682666666666667,
622
+ "grad_norm": 6.038529396057129,
623
+ "learning_rate": 1.1399483604004402e-05,
624
+ "loss": 0.572,
625
+ "step": 880
626
+ },
627
+ {
628
+ "epoch": 4.736,
629
+ "grad_norm": 5.514012813568115,
630
+ "learning_rate": 1.0524878317245712e-05,
631
+ "loss": 0.5582,
632
+ "step": 890
633
+ },
634
+ {
635
+ "epoch": 4.789333333333333,
636
+ "grad_norm": 5.5417680740356445,
637
+ "learning_rate": 9.681219359248106e-06,
638
+ "loss": 0.5343,
639
+ "step": 900
640
+ },
641
+ {
642
+ "epoch": 4.842666666666666,
643
+ "grad_norm": 5.650195121765137,
644
+ "learning_rate": 8.869168112329441e-06,
645
+ "loss": 0.5305,
646
+ "step": 910
647
+ },
648
+ {
649
+ "epoch": 4.896,
650
+ "grad_norm": 5.692509174346924,
651
+ "learning_rate": 8.089361180096927e-06,
652
+ "loss": 0.5936,
653
+ "step": 920
654
+ },
655
+ {
656
+ "epoch": 4.949333333333334,
657
+ "grad_norm": 5.946127891540527,
658
+ "learning_rate": 7.342409888384816e-06,
659
+ "loss": 0.5516,
660
+ "step": 930
661
+ },
662
+ {
663
+ "epoch": 5.0,
664
+ "grad_norm": 3.735246419906616,
665
+ "learning_rate": 6.628899806008515e-06,
666
+ "loss": 0.5143,
667
+ "step": 940
668
+ },
669
+ {
670
+ "epoch": 5.053333333333334,
671
+ "grad_norm": 3.113004207611084,
672
+ "learning_rate": 5.949390285710776e-06,
673
+ "loss": 0.2498,
674
+ "step": 950
675
+ },
676
+ {
677
+ "epoch": 5.1066666666666665,
678
+ "grad_norm": 4.239569664001465,
679
+ "learning_rate": 5.304414025659832e-06,
680
+ "loss": 0.2392,
681
+ "step": 960
682
+ },
683
+ {
684
+ "epoch": 5.16,
685
+ "grad_norm": 3.16518497467041,
686
+ "learning_rate": 4.694476651843294e-06,
687
+ "loss": 0.2241,
688
+ "step": 970
689
+ },
690
+ {
691
+ "epoch": 5.213333333333333,
692
+ "grad_norm": 3.297945499420166,
693
+ "learning_rate": 4.120056321685101e-06,
694
+ "loss": 0.2332,
695
+ "step": 980
696
+ },
697
+ {
698
+ "epoch": 5.266666666666667,
699
+ "grad_norm": 3.4214251041412354,
700
+ "learning_rate": 3.581603349196372e-06,
701
+ "loss": 0.22,
702
+ "step": 990
703
+ },
704
+ {
705
+ "epoch": 5.32,
706
+ "grad_norm": 2.959951162338257,
707
+ "learning_rate": 3.079539851953911e-06,
708
+ "loss": 0.2192,
709
+ "step": 1000
710
+ },
711
+ {
712
+ "epoch": 5.373333333333333,
713
+ "grad_norm": 2.8886938095092773,
714
+ "learning_rate": 2.614259420183218e-06,
715
+ "loss": 0.2244,
716
+ "step": 1010
717
+ },
718
+ {
719
+ "epoch": 5.426666666666667,
720
+ "grad_norm": 3.0144460201263428,
721
+ "learning_rate": 2.1861268082053466e-06,
722
+ "loss": 0.2234,
723
+ "step": 1020
724
+ },
725
+ {
726
+ "epoch": 5.48,
727
+ "grad_norm": 3.1671485900878906,
728
+ "learning_rate": 1.7954776484895186e-06,
729
+ "loss": 0.2178,
730
+ "step": 1030
731
+ },
732
+ {
733
+ "epoch": 5.533333333333333,
734
+ "grad_norm": 2.926842451095581,
735
+ "learning_rate": 1.4426181885357215e-06,
736
+ "loss": 0.2222,
737
+ "step": 1040
738
+ },
739
+ {
740
+ "epoch": 5.586666666666667,
741
+ "grad_norm": 3.4401443004608154,
742
+ "learning_rate": 1.1278250507934518e-06,
743
+ "loss": 0.2276,
744
+ "step": 1050
745
+ },
746
+ {
747
+ "epoch": 5.64,
748
+ "grad_norm": 3.664684295654297,
749
+ "learning_rate": 8.513450158049108e-07,
750
+ "loss": 0.2316,
751
+ "step": 1060
752
+ },
753
+ {
754
+ "epoch": 5.693333333333333,
755
+ "grad_norm": 3.684296131134033,
756
+ "learning_rate": 6.133948287426028e-07,
757
+ "loss": 0.2237,
758
+ "step": 1070
759
+ },
760
+ {
761
+ "epoch": 5.746666666666667,
762
+ "grad_norm": 3.4764082431793213,
763
+ "learning_rate": 4.141610294930043e-07,
764
+ "loss": 0.2227,
765
+ "step": 1080
766
+ },
767
+ {
768
+ "epoch": 5.8,
769
+ "grad_norm": 2.9150614738464355,
770
+ "learning_rate": 2.537998064195579e-07,
771
+ "loss": 0.2257,
772
+ "step": 1090
773
+ },
774
+ {
775
+ "epoch": 5.8533333333333335,
776
+ "grad_norm": 3.013000011444092,
777
+ "learning_rate": 1.324368739195281e-07,
778
+ "loss": 0.2226,
779
+ "step": 1100
780
+ },
781
+ {
782
+ "epoch": 5.906666666666666,
783
+ "grad_norm": 3.105398654937744,
784
+ "learning_rate": 5.016737387085191e-08,
785
+ "loss": 0.2171,
786
+ "step": 1110
787
+ },
788
+ {
789
+ "epoch": 5.96,
790
+ "grad_norm": 3.3148136138916016,
791
+ "learning_rate": 7.055801046113031e-09,
792
+ "loss": 0.2126,
793
+ "step": 1120
794
+ },
795
+ {
796
+ "epoch": 5.970666666666666,
797
+ "step": 1122,
798
+ "total_flos": 6.092825392526131e+16,
799
+ "train_loss": 3.2993942699436625,
800
+ "train_runtime": 2130.8581,
801
+ "train_samples_per_second": 16.892,
802
+ "train_steps_per_second": 0.527
803
+ }
804
+ ],
805
+ "logging_steps": 10,
806
+ "max_steps": 1122,
807
+ "num_input_tokens_seen": 0,
808
+ "num_train_epochs": 6,
809
+ "save_steps": 2000,
810
+ "stateful_callbacks": {
811
+ "TrainerControl": {
812
+ "args": {
813
+ "should_epoch_stop": false,
814
+ "should_evaluate": false,
815
+ "should_log": false,
816
+ "should_save": true,
817
+ "should_training_stop": true
818
+ },
819
+ "attributes": {}
820
+ }
821
+ },
822
+ "total_flos": 6.092825392526131e+16,
823
+ "train_batch_size": 2,
824
+ "trial_name": null,
825
+ "trial_params": null
826
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc812bdee3802bdc46f1df3b47b8a934fcf5137c939503e203b3959a393f3bfd
3
+ size 5560
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff