bimabk committed on
Commit
b76b9e2
·
verified ·
1 Parent(s): dedf468

Upload task output 0ace46bc-8f88-4e70-95b9-9502b5a4d1dc

Browse files
config.json CHANGED
@@ -4,11 +4,38 @@
4
  ],
5
  "attention_dropout": 0.0,
6
  "bos_token_id": 151643,
 
7
  "eos_token_id": 151643,
8
  "hidden_act": "silu",
9
  "hidden_size": 896,
10
  "initializer_range": 0.02,
11
  "intermediate_size": 4864,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  "max_position_embeddings": 32768,
13
  "max_window_layers": 24,
14
  "model_type": "qwen2",
@@ -16,9 +43,13 @@
16
  "num_hidden_layers": 24,
17
  "num_key_value_heads": 2,
18
  "rms_norm_eps": 1e-06,
 
 
 
 
19
  "rope_scaling": null,
20
- "rope_theta": 1000000.0,
21
- "sliding_window": 32768,
22
  "tie_word_embeddings": true,
23
  "torch_dtype": "bfloat16",
24
  "transformers_version": "4.51.3",
 
4
  ],
5
  "attention_dropout": 0.0,
6
  "bos_token_id": 151643,
7
+ "dtype": "bfloat16",
8
  "eos_token_id": 151643,
9
  "hidden_act": "silu",
10
  "hidden_size": 896,
11
  "initializer_range": 0.02,
12
  "intermediate_size": 4864,
13
+ "layer_types": [
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention"
38
+ ],
39
  "max_position_embeddings": 32768,
40
  "max_window_layers": 24,
41
  "model_type": "qwen2",
 
43
  "num_hidden_layers": 24,
44
  "num_key_value_heads": 2,
45
  "rms_norm_eps": 1e-06,
46
+ "rope_parameters": {
47
+ "rope_theta": 1000000.0,
48
+ "rope_type": "default"
49
+ },
50
  "rope_scaling": null,
51
+ "rope_theta": 10000.0,
52
+ "sliding_window": null,
53
  "tie_word_embeddings": true,
54
  "torch_dtype": "bfloat16",
55
  "transformers_version": "4.51.3",
loss.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 200,1.234911322593689
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0d813fdcbbe23f03f94a462d9e34860c669bba2b2c2b035d0a4b6e425f7d2ec
3
  size 988097824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b55eeba4a93cecf11ad953d8b45fdeb11144b309b6e9f9f4f119bf54a5c016b
3
  size 988097824
special_tokens_map.json CHANGED
@@ -1,19 +1,17 @@
1
  {
2
- "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>",
5
- "<|object_ref_start|>",
6
- "<|object_ref_end|>",
7
- "<|box_start|>",
8
- "<|box_end|>",
9
- "<|quad_start|>",
10
- "<|quad_end|>",
11
- "<|vision_start|>",
12
- "<|vision_end|>",
13
- "<|vision_pad|>",
14
- "<|image_pad|>",
15
- "<|video_pad|>"
16
- ],
17
  "eos_token": {
18
  "content": "<|endoftext|>",
19
  "lstrip": false,
 
1
  {
2
+ "<|box_end|>": "<|box_end|>",
3
+ "<|box_start|>": "<|box_start|>",
4
+ "<|im_end|>": "<|im_end|>",
5
+ "<|im_start|>": "<|im_start|>",
6
+ "<|image_pad|>": "<|image_pad|>",
7
+ "<|object_ref_end|>": "<|object_ref_end|>",
8
+ "<|object_ref_start|>": "<|object_ref_start|>",
9
+ "<|quad_end|>": "<|quad_end|>",
10
+ "<|quad_start|>": "<|quad_start|>",
11
+ "<|video_pad|>": "<|video_pad|>",
12
+ "<|vision_end|>": "<|vision_end|>",
13
+ "<|vision_pad|>": "<|vision_pad|>",
14
+ "<|vision_start|>": "<|vision_start|>",
 
 
15
  "eos_token": {
16
  "content": "<|endoftext|>",
17
  "lstrip": false,
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
- size 11421896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
tokenizer_config.json CHANGED
@@ -1,5 +1,17 @@
1
  {
2
- "add_bos_token": false,
 
 
 
 
 
 
 
 
 
 
 
 
3
  "add_prefix_space": false,
4
  "added_tokens_decoder": {
5
  "151643": {
@@ -179,27 +191,29 @@
179
  "special": false
180
  }
181
  },
182
- "additional_special_tokens": [
183
- "<|im_start|>",
184
- "<|im_end|>",
185
- "<|object_ref_start|>",
186
- "<|object_ref_end|>",
187
- "<|box_start|>",
188
- "<|box_end|>",
189
- "<|quad_start|>",
190
- "<|quad_end|>",
191
- "<|vision_start|>",
192
- "<|vision_end|>",
193
- "<|vision_pad|>",
194
- "<|image_pad|>",
195
- "<|video_pad|>"
196
- ],
197
  "bos_token": null,
198
  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content 
}}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
  "clean_up_tokenization_spaces": false,
200
  "eos_token": "<|endoftext|>",
201
  "errors": "replace",
202
- "extra_special_tokens": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  "model_max_length": 131072,
204
  "pad_token": "<|endoftext|>",
205
  "split_special_tokens": false,
 
1
  {
2
+ "<|box_end|>": "<|box_end|>",
3
+ "<|box_start|>": "<|box_start|>",
4
+ "<|im_end|>": "<|im_end|>",
5
+ "<|im_start|>": "<|im_start|>",
6
+ "<|image_pad|>": "<|image_pad|>",
7
+ "<|object_ref_end|>": "<|object_ref_end|>",
8
+ "<|object_ref_start|>": "<|object_ref_start|>",
9
+ "<|quad_end|>": "<|quad_end|>",
10
+ "<|quad_start|>": "<|quad_start|>",
11
+ "<|video_pad|>": "<|video_pad|>",
12
+ "<|vision_end|>": "<|vision_end|>",
13
+ "<|vision_pad|>": "<|vision_pad|>",
14
+ "<|vision_start|>": "<|vision_start|>",
15
  "add_prefix_space": false,
16
  "added_tokens_decoder": {
17
  "151643": {
 
191
  "special": false
192
  }
193
  },
194
+ "backend": "tokenizers",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  "bos_token": null,
196
  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content 
}}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
197
  "clean_up_tokenization_spaces": false,
198
  "eos_token": "<|endoftext|>",
199
  "errors": "replace",
200
+ "extra_special_tokens": {
201
+ "<|box_end|>": "<|box_end|>",
202
+ "<|box_start|>": "<|box_start|>",
203
+ "<|im_end|>": "<|im_end|>",
204
+ "<|im_start|>": "<|im_start|>",
205
+ "<|image_pad|>": "<|image_pad|>",
206
+ "<|object_ref_end|>": "<|object_ref_end|>",
207
+ "<|object_ref_start|>": "<|object_ref_start|>",
208
+ "<|quad_end|>": "<|quad_end|>",
209
+ "<|quad_start|>": "<|quad_start|>",
210
+ "<|video_pad|>": "<|video_pad|>",
211
+ "<|vision_end|>": "<|vision_end|>",
212
+ "<|vision_pad|>": "<|vision_pad|>",
213
+ "<|vision_start|>": "<|vision_start|>"
214
+ },
215
+ "is_local": false,
216
+ "local_files_only": false,
217
  "model_max_length": 131072,
218
  "pad_token": "<|endoftext|>",
219
  "split_special_tokens": false,
trainer_state.json ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.025,
14
+ "grad_norm": 40.75,
15
+ "learning_rate": 1.2215565714285715e-05,
16
+ "loss": 3.6599,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.05,
21
+ "grad_norm": 15.9375,
22
+ "learning_rate": 2.7485022857142857e-05,
23
+ "loss": 2.5999,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.075,
28
+ "grad_norm": 13.6875,
29
+ "learning_rate": 4.2754480000000004e-05,
30
+ "loss": 1.9074,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.1,
35
+ "grad_norm": 8.1875,
36
+ "learning_rate": 5.802393714285714e-05,
37
+ "loss": 1.72,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.125,
42
+ "grad_norm": 7.53125,
43
+ "learning_rate": 7.329339428571428e-05,
44
+ "loss": 1.6114,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.15,
49
+ "grad_norm": 7.59375,
50
+ "learning_rate": 8.856285142857144e-05,
51
+ "loss": 1.6335,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.175,
56
+ "grad_norm": 6.78125,
57
+ "learning_rate": 0.00010383230857142857,
58
+ "loss": 1.5292,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.2,
63
+ "grad_norm": 7.3125,
64
+ "learning_rate": 0.00010688538037339344,
65
+ "loss": 1.5352,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.225,
70
+ "grad_norm": 6.25,
71
+ "learning_rate": 0.0001068820506977537,
72
+ "loss": 1.5399,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.25,
77
+ "grad_norm": 6.25,
78
+ "learning_rate": 0.00010687615995902611,
79
+ "loss": 1.5242,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.275,
84
+ "grad_norm": 5.71875,
85
+ "learning_rate": 0.00010686770853363879,
86
+ "loss": 1.5208,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.3,
91
+ "grad_norm": 6.09375,
92
+ "learning_rate": 0.00010685669696165211,
93
+ "loss": 1.5109,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.325,
98
+ "grad_norm": 5.8125,
99
+ "learning_rate": 0.0001068431259467241,
100
+ "loss": 1.4876,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.35,
105
+ "grad_norm": 6.125,
106
+ "learning_rate": 0.00010682699635606553,
107
+ "loss": 1.4368,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.375,
112
+ "grad_norm": 5.34375,
113
+ "learning_rate": 0.0001068083092203845,
114
+ "loss": 1.4979,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.4,
119
+ "grad_norm": 5.5625,
120
+ "learning_rate": 0.00010678706573382047,
121
+ "loss": 1.4688,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.425,
126
+ "grad_norm": 5.71875,
127
+ "learning_rate": 0.00010676326725386813,
128
+ "loss": 1.4732,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.45,
133
+ "grad_norm": 5.65625,
134
+ "learning_rate": 0.00010673691530129053,
135
+ "loss": 1.4505,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.475,
140
+ "grad_norm": 6.34375,
141
+ "learning_rate": 0.00010670801156002194,
142
+ "loss": 1.4747,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.5,
147
+ "grad_norm": 5.3125,
148
+ "learning_rate": 0.00010667655787706019,
149
+ "loss": 1.4145,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.525,
154
+ "grad_norm": 5.0,
155
+ "learning_rate": 0.00010664255626234872,
156
+ "loss": 1.4288,
157
+ "step": 105
158
+ },
159
+ {
160
+ "epoch": 0.55,
161
+ "grad_norm": 5.03125,
162
+ "learning_rate": 0.00010660600888864813,
163
+ "loss": 1.431,
164
+ "step": 110
165
+ },
166
+ {
167
+ "epoch": 0.575,
168
+ "grad_norm": 5.0,
169
+ "learning_rate": 0.00010656691809139731,
170
+ "loss": 1.406,
171
+ "step": 115
172
+ },
173
+ {
174
+ "epoch": 0.6,
175
+ "grad_norm": 4.71875,
176
+ "learning_rate": 0.00010652528636856418,
177
+ "loss": 1.4174,
178
+ "step": 120
179
+ },
180
+ {
181
+ "epoch": 0.625,
182
+ "grad_norm": 5.375,
183
+ "learning_rate": 0.00010648111638048613,
184
+ "loss": 1.3684,
185
+ "step": 125
186
+ },
187
+ {
188
+ "epoch": 0.65,
189
+ "grad_norm": 4.9375,
190
+ "learning_rate": 0.00010643441094969993,
191
+ "loss": 1.4044,
192
+ "step": 130
193
+ },
194
+ {
195
+ "epoch": 0.675,
196
+ "grad_norm": 5.1875,
197
+ "learning_rate": 0.0001063851730607615,
198
+ "loss": 1.3708,
199
+ "step": 135
200
+ },
201
+ {
202
+ "epoch": 0.7,
203
+ "grad_norm": 4.75,
204
+ "learning_rate": 0.00010633340586005505,
205
+ "loss": 1.3198,
206
+ "step": 140
207
+ },
208
+ {
209
+ "epoch": 0.725,
210
+ "grad_norm": 5.375,
211
+ "learning_rate": 0.00010627911265559208,
212
+ "loss": 1.4137,
213
+ "step": 145
214
+ },
215
+ {
216
+ "epoch": 0.75,
217
+ "grad_norm": 5.15625,
218
+ "learning_rate": 0.00010622229691680005,
219
+ "loss": 1.3347,
220
+ "step": 150
221
+ },
222
+ {
223
+ "epoch": 0.775,
224
+ "grad_norm": 4.6875,
225
+ "learning_rate": 0.00010616296227430056,
226
+ "loss": 1.338,
227
+ "step": 155
228
+ },
229
+ {
230
+ "epoch": 0.8,
231
+ "grad_norm": 5.0625,
232
+ "learning_rate": 0.00010610111251967746,
233
+ "loss": 1.3591,
234
+ "step": 160
235
+ },
236
+ {
237
+ "epoch": 0.825,
238
+ "grad_norm": 5.0,
239
+ "learning_rate": 0.00010603675160523444,
240
+ "loss": 1.3449,
241
+ "step": 165
242
+ },
243
+ {
244
+ "epoch": 0.85,
245
+ "grad_norm": 5.15625,
246
+ "learning_rate": 0.00010596988364374265,
247
+ "loss": 1.3247,
248
+ "step": 170
249
+ },
250
+ {
251
+ "epoch": 0.875,
252
+ "grad_norm": 6.0,
253
+ "learning_rate": 0.00010590051290817767,
254
+ "loss": 1.3147,
255
+ "step": 175
256
+ },
257
+ {
258
+ "epoch": 0.9,
259
+ "grad_norm": 4.5625,
260
+ "learning_rate": 0.00010582864383144663,
261
+ "loss": 1.3264,
262
+ "step": 180
263
+ },
264
+ {
265
+ "epoch": 0.925,
266
+ "grad_norm": 4.71875,
267
+ "learning_rate": 0.00010575428100610488,
268
+ "loss": 1.2702,
269
+ "step": 185
270
+ },
271
+ {
272
+ "epoch": 0.95,
273
+ "grad_norm": 4.46875,
274
+ "learning_rate": 0.00010567742918406246,
275
+ "loss": 1.3076,
276
+ "step": 190
277
+ },
278
+ {
279
+ "epoch": 0.975,
280
+ "grad_norm": 4.46875,
281
+ "learning_rate": 0.0001055980932762806,
282
+ "loss": 1.2769,
283
+ "step": 195
284
+ },
285
+ {
286
+ "epoch": 1.0,
287
+ "grad_norm": 4.59375,
288
+ "learning_rate": 0.00010551627835245768,
289
+ "loss": 1.2735,
290
+ "step": 200
291
+ },
292
+ {
293
+ "epoch": 1.0,
294
+ "eval_loss": 1.234911322593689,
295
+ "eval_runtime": 0.3775,
296
+ "eval_samples_per_second": 7.948,
297
+ "eval_steps_per_second": 7.948,
298
+ "step": 200
299
+ }
300
+ ],
301
+ "logging_steps": 5,
302
+ "max_steps": 2000,
303
+ "num_input_tokens_seen": 0,
304
+ "num_train_epochs": 10,
305
+ "save_steps": 500,
306
+ "stateful_callbacks": {
307
+ "TrainerControl": {
308
+ "args": {
309
+ "should_epoch_stop": false,
310
+ "should_evaluate": false,
311
+ "should_log": false,
312
+ "should_save": true,
313
+ "should_training_stop": false
314
+ },
315
+ "attributes": {}
316
+ }
317
+ },
318
+ "total_flos": 2.46279712014336e+17,
319
+ "train_batch_size": 140,
320
+ "trial_name": null,
321
+ "trial_params": null
322
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9335070a47dedbb24e52c226990e18916ebe5bfee099d3faddf68095a47d9160
3
+ size 5688