starlineventures committed on
Commit
c7d6b26
·
verified ·
1 Parent(s): 231f780

starlineventures/pilot-talk

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,57 +1,56 @@
1
  ---
2
  base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
3
- library_name: peft
4
- license: mit
5
  tags:
6
- - trl
7
- - sft
8
- - finetuned
9
  - generated_from_trainer
10
- model-index:
11
- - name: outputs
12
- results: []
13
  ---
14
 
15
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
16
- should probably proofread and complete it, then remove this comment. -->
17
 
18
- # outputs
 
19
 
20
- This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on an unknown dataset.
21
 
22
- ## Model description
 
23
 
24
- More information needed
 
 
 
 
25
 
26
- ## Intended uses & limitations
27
-
28
- More information needed
29
-
30
- ## Training and evaluation data
31
 
32
- More information needed
33
 
34
- ## Training procedure
35
 
36
- ### Training hyperparameters
37
 
38
- The following hyperparameters were used during training:
39
- - learning_rate: 0.0001
40
- - train_batch_size: 4
41
- - eval_batch_size: 16
42
- - seed: 3407
43
- - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
- - lr_scheduler_type: linear
45
- - num_epochs: 3
46
 
47
- ### Training results
 
 
 
 
48
 
 
49
 
50
 
51
- ### Framework versions
52
 
53
- - PEFT 0.12.0
54
- - Transformers 4.44.2
55
- - Pytorch 2.4.0+cu121
56
- - Datasets 3.0.0
57
- - Tokenizers 0.19.1
 
 
 
 
 
 
 
 
1
  ---
2
  base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
3
+ library_name: transformers
4
+ model_name: outputs
5
  tags:
 
 
 
6
  - generated_from_trainer
7
+ license: mit
 
 
8
  ---
9
 
10
+ # Model Card for outputs
 
11
 
12
+ This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B).
13
+ It has been trained using [TRL](https://github.com/huggingface/trl).
14
 
15
+ ## Quick start
16
 
17
+ ```python
18
+ from transformers import pipeline
19
 
20
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
21
+ generator = pipeline("text-generation", model="starlineventures/pilot-talk", device="cuda")
22
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
23
+ print(output["generated_text"])
24
+ ```
25
 
26
+ ## Training procedure
 
 
 
 
27
 
28
+
29
 
 
30
 
31
+ This model was trained with SFT.
32
 
33
+ ### Framework versions
 
 
 
 
 
 
 
34
 
35
+ - TRL: 0.18.1
36
+ - Transformers: 4.52.4
37
+ - Pytorch: 2.7.0
38
+ - Datasets: 3.6.0
39
+ - Tokenizers: 0.21.1
40
 
41
+ ## Citations
42
 
43
 
 
44
 
45
+ Cite TRL as:
46
+
47
+ ```bibtex
48
+ @misc{vonwerra2022trl,
49
+ title = {{TRL: Transformer Reinforcement Learning}},
50
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
51
+ year = 2020,
52
+ journal = {GitHub repository},
53
+ publisher = {GitHub},
54
+ howpublished = {\url{https://github.com/huggingface/trl}}
55
+ }
56
+ ```
adapter_config.json CHANGED
@@ -6,6 +6,9 @@
6
  },
7
  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
8
  "bias": "none",
 
 
 
9
  "fan_in_fan_out": false,
10
  "inference_mode": true,
11
  "init_lora_weights": true,
@@ -14,6 +17,7 @@
14
  "layers_to_transform": null,
15
  "loftq_config": {},
16
  "lora_alpha": 16,
 
17
  "lora_dropout": 0,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
@@ -23,15 +27,16 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "down_proj",
27
- "k_proj",
28
- "up_proj",
29
- "q_proj",
30
  "gate_proj",
31
  "v_proj",
 
 
 
 
32
  "o_proj"
33
  ],
34
  "task_type": null,
 
35
  "use_dora": false,
36
  "use_rslora": false
37
  }
 
6
  },
7
  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
8
  "bias": "none",
9
+ "corda_config": null,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
  "fan_in_fan_out": false,
13
  "inference_mode": true,
14
  "init_lora_weights": true,
 
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
  "lora_alpha": 16,
20
+ "lora_bias": false,
21
  "lora_dropout": 0,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
 
27
  "rank_pattern": {},
28
  "revision": null,
29
  "target_modules": [
 
 
 
 
30
  "gate_proj",
31
  "v_proj",
32
+ "k_proj",
33
+ "q_proj",
34
+ "up_proj",
35
+ "down_proj",
36
  "o_proj"
37
  ],
38
  "task_type": null,
39
+ "trainable_token_indices": null,
40
  "use_dora": false,
41
  "use_rslora": false
42
  }
all_results.json CHANGED
@@ -1,8 +1,7 @@
1
  {
2
- "epoch": 3.0,
3
  "total_flos": 0.0,
4
- "train_loss": 0.11339402707914512,
5
- "train_runtime": 443.7998,
6
- "train_samples_per_second": 4.326,
7
- "train_steps_per_second": 1.082
8
  }
 
1
  {
 
2
  "total_flos": 0.0,
3
+ "train_loss": 0.09647794626653194,
4
+ "train_runtime": 276.4027,
5
+ "train_samples_per_second": 4.631,
6
+ "train_steps_per_second": 1.158
7
  }
chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% 
endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\n'}}{% endif %}
config.json CHANGED
@@ -1,5 +1,4 @@
1
  {
2
- "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
@@ -17,11 +16,12 @@
17
  "num_hidden_layers": 28,
18
  "num_key_value_heads": 2,
19
  "rms_norm_eps": 1e-06,
 
20
  "rope_theta": 10000,
21
- "sliding_window": null,
22
  "tie_word_embeddings": false,
23
  "torch_dtype": "bfloat16",
24
- "transformers_version": "4.44.2",
25
  "use_cache": false,
26
  "use_mrope": false,
27
  "use_sliding_window": false,
 
1
  {
 
2
  "architectures": [
3
  "Qwen2ForCausalLM"
4
  ],
 
16
  "num_hidden_layers": 28,
17
  "num_key_value_heads": 2,
18
  "rms_norm_eps": 1e-06,
19
+ "rope_scaling": null,
20
  "rope_theta": 10000,
21
+ "sliding_window": 4096,
22
  "tie_word_embeddings": false,
23
  "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.52.4",
25
  "use_cache": false,
26
  "use_mrope": false,
27
  "use_sliding_window": false,
generation_config.json CHANGED
@@ -5,5 +5,5 @@
5
  "eos_token_id": 151643,
6
  "temperature": 0.6,
7
  "top_p": 0.95,
8
- "transformers_version": "4.44.2"
9
  }
 
5
  "eos_token_id": 151643,
6
  "temperature": 0.6,
7
  "top_p": 0.95,
8
+ "transformers_version": "4.52.4"
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef0da30790a2243077da2841088a57e8b854924ab9dd99a138cac9fb09043fb8
3
  size 3554214752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25afe2c83203d89e709118ec4867d6c1d40d0479a8296b4f90f5090578885517
3
  size 3554214752
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -181,14 +181,14 @@
181
  }
182
  },
183
  "bos_token": "<|begin▁of▁sentence|>",
184
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool 
%}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}",
185
  "clean_up_tokenization_spaces": false,
186
  "eos_token": "<|end▁of▁sentence|>",
 
187
  "legacy": true,
188
  "model_max_length": 2048,
189
  "pad_token": "<|end▁of▁sentence|>",
190
  "sp_model_kwargs": {},
191
- "tokenizer_class": "LlamaTokenizer",
192
  "unk_token": null,
193
  "use_default_system_prompt": false
194
  }
 
181
  }
182
  },
183
  "bos_token": "<|begin▁of▁sentence|>",
 
184
  "clean_up_tokenization_spaces": false,
185
  "eos_token": "<|end▁of▁sentence|>",
186
+ "extra_special_tokens": {},
187
  "legacy": true,
188
  "model_max_length": 2048,
189
  "pad_token": "<|end▁of▁sentence|>",
190
  "sp_model_kwargs": {},
191
+ "tokenizer_class": "LlamaTokenizerFast",
192
  "unk_token": null,
193
  "use_default_system_prompt": false
194
  }
train_results.json CHANGED
@@ -1,8 +1,7 @@
1
  {
2
- "epoch": 3.0,
3
  "total_flos": 0.0,
4
- "train_loss": 0.11339402707914512,
5
- "train_runtime": 443.7998,
6
- "train_samples_per_second": 4.326,
7
- "train_steps_per_second": 1.082
8
  }
 
1
  {
 
2
  "total_flos": 0.0,
3
+ "train_loss": 0.09647794626653194,
4
+ "train_runtime": 276.4027,
5
+ "train_samples_per_second": 4.631,
6
+ "train_steps_per_second": 1.158
7
  }
trainer_state.json CHANGED
@@ -1,391 +1,337 @@
1
  {
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
  "eval_steps": 500,
6
- "global_step": 480,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0625,
13
- "grad_norm": 0.6764543056488037,
14
- "learning_rate": 9.791666666666667e-05,
15
- "loss": 4.0135,
 
 
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.125,
20
- "grad_norm": 0.2798255681991577,
21
- "learning_rate": 9.583333333333334e-05,
22
- "loss": 0.1079,
 
 
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.1875,
27
- "grad_norm": 0.2010556012392044,
28
- "learning_rate": 9.375e-05,
29
- "loss": 0.0619,
 
 
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.25,
34
- "grad_norm": 0.19590723514556885,
35
- "learning_rate": 9.166666666666667e-05,
36
- "loss": 0.0474,
 
 
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.3125,
41
- "grad_norm": 0.1653718799352646,
42
- "learning_rate": 8.958333333333335e-05,
43
- "loss": 0.0417,
 
 
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.375,
48
- "grad_norm": 0.19019761681556702,
49
- "learning_rate": 8.75e-05,
50
- "loss": 0.037,
 
 
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.4375,
55
- "grad_norm": 0.18890416622161865,
56
- "learning_rate": 8.541666666666666e-05,
57
- "loss": 0.0339,
 
 
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.5,
62
- "grad_norm": 0.19193948805332184,
63
- "learning_rate": 8.333333333333334e-05,
64
- "loss": 0.0317,
 
 
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.5625,
69
- "grad_norm": 0.12777990102767944,
70
- "learning_rate": 8.125000000000001e-05,
71
- "loss": 0.0301,
 
 
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.625,
76
- "grad_norm": 0.12041394412517548,
77
- "learning_rate": 7.916666666666666e-05,
78
- "loss": 0.0287,
 
 
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.6875,
83
- "grad_norm": 0.1440989226102829,
84
- "learning_rate": 7.708333333333334e-05,
85
  "loss": 0.0288,
 
 
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.6875,
90
- "eval_runtime": 12.6929,
91
- "eval_samples_per_second": 12.606,
92
- "eval_steps_per_second": 0.788,
93
  "step": 110
94
  },
95
  {
96
  "epoch": 0.75,
97
- "grad_norm": 0.14094920456409454,
98
- "learning_rate": 7.500000000000001e-05,
99
- "loss": 0.0284,
 
 
100
  "step": 120
101
  },
102
  {
103
  "epoch": 0.8125,
104
- "grad_norm": 0.13362859189510345,
105
- "learning_rate": 7.291666666666667e-05,
106
- "loss": 0.028,
 
 
107
  "step": 130
108
  },
109
  {
110
  "epoch": 0.875,
111
- "grad_norm": 0.10230981558561325,
112
- "learning_rate": 7.083333333333334e-05,
113
- "loss": 0.0278,
 
 
114
  "step": 140
115
  },
116
  {
117
  "epoch": 0.9375,
118
- "grad_norm": 0.10484622418880463,
119
- "learning_rate": 6.875e-05,
120
- "loss": 0.0277,
 
 
121
  "step": 150
122
  },
123
  {
124
  "epoch": 1.0,
125
- "grad_norm": 0.1045694574713707,
126
- "learning_rate": 6.666666666666667e-05,
127
- "loss": 0.0277,
 
 
128
  "step": 160
129
  },
130
  {
131
  "epoch": 1.0,
132
- "eval_runtime": 11.2891,
133
- "eval_samples_per_second": 14.173,
134
- "eval_steps_per_second": 0.886,
135
  "step": 160
136
  },
137
  {
138
  "epoch": 1.0625,
139
- "grad_norm": 0.11217521131038666,
140
- "learning_rate": 6.458333333333334e-05,
141
  "loss": 0.0267,
 
 
142
  "step": 170
143
  },
144
  {
145
  "epoch": 1.125,
146
- "grad_norm": 0.1110721006989479,
147
- "learning_rate": 6.25e-05,
148
  "loss": 0.027,
 
 
149
  "step": 180
150
  },
151
  {
152
  "epoch": 1.1875,
153
- "grad_norm": 0.12359625101089478,
154
- "learning_rate": 6.041666666666667e-05,
155
- "loss": 0.0272,
 
 
156
  "step": 190
157
  },
158
  {
159
  "epoch": 1.25,
160
- "grad_norm": 0.10066195577383041,
161
- "learning_rate": 5.833333333333334e-05,
162
- "loss": 0.027,
 
 
163
  "step": 200
164
  },
165
  {
166
  "epoch": 1.3125,
167
- "grad_norm": 0.10776817798614502,
168
- "learning_rate": 5.6250000000000005e-05,
169
- "loss": 0.0269,
 
 
170
  "step": 210
171
  },
172
  {
173
  "epoch": 1.375,
174
- "grad_norm": 0.10342445224523544,
175
- "learning_rate": 5.4166666666666664e-05,
176
- "loss": 0.027,
 
 
177
  "step": 220
178
  },
179
  {
180
  "epoch": 1.4375,
181
- "grad_norm": 0.0953444242477417,
182
- "learning_rate": 5.208333333333334e-05,
183
- "loss": 0.027,
 
 
184
  "step": 230
185
  },
186
  {
187
  "epoch": 1.5,
188
- "grad_norm": 0.10930311679840088,
189
- "learning_rate": 5e-05,
190
- "loss": 0.0269,
 
 
191
  "step": 240
192
  },
193
  {
194
  "epoch": 1.5625,
195
- "grad_norm": 0.11344899982213974,
196
- "learning_rate": 4.791666666666667e-05,
197
- "loss": 0.0268,
 
 
198
  "step": 250
199
  },
200
  {
201
  "epoch": 1.625,
202
- "grad_norm": 0.10314708203077316,
203
- "learning_rate": 4.5833333333333334e-05,
204
- "loss": 0.0267,
 
 
205
  "step": 260
206
  },
207
  {
208
  "epoch": 1.6875,
209
- "grad_norm": 0.11634312570095062,
210
- "learning_rate": 4.375e-05,
211
- "loss": 0.0266,
 
 
212
  "step": 270
213
  },
214
  {
215
  "epoch": 1.75,
216
- "grad_norm": 0.09650956094264984,
217
- "learning_rate": 4.166666666666667e-05,
218
- "loss": 0.0267,
 
 
219
  "step": 280
220
  },
221
  {
222
  "epoch": 1.8125,
223
- "grad_norm": 0.10790986567735672,
224
- "learning_rate": 3.958333333333333e-05,
225
- "loss": 0.0266,
 
 
226
  "step": 290
227
  },
228
  {
229
  "epoch": 1.875,
230
- "grad_norm": 0.10706381499767303,
231
- "learning_rate": 3.7500000000000003e-05,
232
- "loss": 0.0264,
 
 
233
  "step": 300
234
  },
235
  {
236
  "epoch": 1.9375,
237
- "grad_norm": 0.11093062162399292,
238
- "learning_rate": 3.541666666666667e-05,
239
- "loss": 0.0266,
 
 
240
  "step": 310
241
  },
242
  {
243
  "epoch": 2.0,
244
- "grad_norm": 0.10339082777500153,
245
- "learning_rate": 3.3333333333333335e-05,
246
- "loss": 0.0265,
 
 
247
  "step": 320
248
  },
249
  {
250
  "epoch": 2.0,
251
- "eval_runtime": 11.299,
252
- "eval_samples_per_second": 14.161,
253
- "eval_steps_per_second": 0.885,
254
  "step": 320
255
  },
256
  {
257
- "epoch": 2.0625,
258
- "grad_norm": 0.10130282491445541,
259
- "learning_rate": 3.125e-05,
260
- "loss": 0.026,
261
- "step": 330
262
- },
263
- {
264
- "epoch": 2.125,
265
- "grad_norm": 0.10614955425262451,
266
- "learning_rate": 2.916666666666667e-05,
267
- "loss": 0.0257,
268
- "step": 340
269
- },
270
- {
271
- "epoch": 2.1875,
272
- "grad_norm": 0.1143997386097908,
273
- "learning_rate": 2.7083333333333332e-05,
274
- "loss": 0.0261,
275
- "step": 350
276
- },
277
- {
278
- "epoch": 2.25,
279
- "grad_norm": 0.10745866596698761,
280
- "learning_rate": 2.5e-05,
281
- "loss": 0.0256,
282
- "step": 360
283
- },
284
- {
285
- "epoch": 2.3125,
286
- "grad_norm": 0.1030006930232048,
287
- "learning_rate": 2.2916666666666667e-05,
288
- "loss": 0.0258,
289
- "step": 370
290
- },
291
- {
292
- "epoch": 2.375,
293
- "grad_norm": 0.10611753165721893,
294
- "learning_rate": 2.0833333333333336e-05,
295
- "loss": 0.026,
296
- "step": 380
297
- },
298
- {
299
- "epoch": 2.4375,
300
- "grad_norm": 0.10169661045074463,
301
- "learning_rate": 1.8750000000000002e-05,
302
- "loss": 0.0257,
303
- "step": 390
304
- },
305
- {
306
- "epoch": 2.5,
307
- "grad_norm": 0.11938793212175369,
308
- "learning_rate": 1.6666666666666667e-05,
309
- "loss": 0.0258,
310
- "step": 400
311
- },
312
- {
313
- "epoch": 2.5625,
314
- "grad_norm": 0.0978316143155098,
315
- "learning_rate": 1.4583333333333335e-05,
316
- "loss": 0.0259,
317
- "step": 410
318
- },
319
- {
320
- "epoch": 2.625,
321
- "grad_norm": 0.09611309319734573,
322
- "learning_rate": 1.25e-05,
323
- "loss": 0.0258,
324
- "step": 420
325
- },
326
- {
327
- "epoch": 2.6875,
328
- "grad_norm": 0.10421048849821091,
329
- "learning_rate": 1.0416666666666668e-05,
330
- "loss": 0.0258,
331
- "step": 430
332
- },
333
- {
334
- "epoch": 2.75,
335
- "grad_norm": 0.09502692520618439,
336
- "learning_rate": 8.333333333333334e-06,
337
- "loss": 0.0257,
338
- "step": 440
339
- },
340
- {
341
- "epoch": 2.8125,
342
- "grad_norm": 0.10091052949428558,
343
- "learning_rate": 6.25e-06,
344
- "loss": 0.0256,
345
- "step": 450
346
- },
347
- {
348
- "epoch": 2.875,
349
- "grad_norm": 0.10061470419168472,
350
- "learning_rate": 4.166666666666667e-06,
351
- "loss": 0.0257,
352
- "step": 460
353
- },
354
- {
355
- "epoch": 2.9375,
356
- "grad_norm": 0.09805355966091156,
357
- "learning_rate": 2.0833333333333334e-06,
358
- "loss": 0.0255,
359
- "step": 470
360
- },
361
- {
362
- "epoch": 3.0,
363
- "grad_norm": 0.11753270030021667,
364
- "learning_rate": 0.0,
365
- "loss": 0.0256,
366
- "step": 480
367
- },
368
- {
369
- "epoch": 3.0,
370
- "eval_runtime": 11.3126,
371
- "eval_samples_per_second": 14.144,
372
- "eval_steps_per_second": 0.884,
373
- "step": 480
374
- },
375
- {
376
- "epoch": 3.0,
377
- "step": 480,
378
  "total_flos": 0.0,
379
- "train_loss": 0.11339402707914512,
380
- "train_runtime": 443.7998,
381
- "train_samples_per_second": 4.326,
382
- "train_steps_per_second": 1.082
383
  }
384
  ],
385
  "logging_steps": 10,
386
- "max_steps": 480,
387
  "num_input_tokens_seen": 0,
388
- "num_train_epochs": 3,
389
  "save_steps": 500,
390
  "stateful_callbacks": {
391
  "TrainerControl": {
 
1
  {
2
+ "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
  "eval_steps": 500,
7
+ "global_step": 320,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0625,
14
+ "grad_norm": 0.6001169085502625,
15
+ "learning_rate": 9.718750000000001e-05,
16
+ "loss": 1.9881,
17
+ "mean_token_accuracy": 0.7977039694786072,
18
+ "num_tokens": 81920.0,
19
  "step": 10
20
  },
21
  {
22
  "epoch": 0.125,
23
+ "grad_norm": 0.46209943294525146,
24
+ "learning_rate": 9.40625e-05,
25
+ "loss": 0.1785,
26
+ "mean_token_accuracy": 0.9618344008922577,
27
+ "num_tokens": 163840.0,
28
  "step": 20
29
  },
30
  {
31
  "epoch": 0.1875,
32
+ "grad_norm": 0.25115829706192017,
33
+ "learning_rate": 9.093750000000001e-05,
34
+ "loss": 0.075,
35
+ "mean_token_accuracy": 0.9808378159999848,
36
+ "num_tokens": 245760.0,
37
  "step": 30
38
  },
39
  {
40
  "epoch": 0.25,
41
+ "grad_norm": 0.18340203166007996,
42
+ "learning_rate": 8.781250000000001e-05,
43
+ "loss": 0.0518,
44
+ "mean_token_accuracy": 0.98480703830719,
45
+ "num_tokens": 327680.0,
46
  "step": 40
47
  },
48
  {
49
  "epoch": 0.3125,
50
+ "grad_norm": 0.16958290338516235,
51
+ "learning_rate": 8.46875e-05,
52
+ "loss": 0.0439,
53
+ "mean_token_accuracy": 0.9856253087520599,
54
+ "num_tokens": 409600.0,
55
  "step": 50
56
  },
57
  {
58
  "epoch": 0.375,
59
+ "grad_norm": 0.18090437352657318,
60
+ "learning_rate": 8.156250000000001e-05,
61
+ "loss": 0.0362,
62
+ "mean_token_accuracy": 0.9867611169815064,
63
+ "num_tokens": 491520.0,
64
  "step": 60
65
  },
66
  {
67
  "epoch": 0.4375,
68
+ "grad_norm": 0.14541096985340118,
69
+ "learning_rate": 7.84375e-05,
70
+ "loss": 0.0324,
71
+ "mean_token_accuracy": 0.9876038134098053,
72
+ "num_tokens": 573440.0,
73
  "step": 70
74
  },
75
  {
76
  "epoch": 0.5,
77
+ "grad_norm": 0.3248767852783203,
78
+ "learning_rate": 7.531250000000001e-05,
79
+ "loss": 0.0309,
80
+ "mean_token_accuracy": 0.9874206185340881,
81
+ "num_tokens": 655360.0,
82
  "step": 80
83
  },
84
  {
85
  "epoch": 0.5625,
86
+ "grad_norm": 0.12345177680253983,
87
+ "learning_rate": 7.218750000000001e-05,
88
+ "loss": 0.03,
89
+ "mean_token_accuracy": 0.9874084055423736,
90
+ "num_tokens": 737280.0,
91
  "step": 90
92
  },
93
  {
94
  "epoch": 0.625,
95
+ "grad_norm": 0.12568464875221252,
96
+ "learning_rate": 6.90625e-05,
97
+ "loss": 0.0288,
98
+ "mean_token_accuracy": 0.9879335641860962,
99
+ "num_tokens": 819200.0,
100
  "step": 100
101
  },
102
  {
103
  "epoch": 0.6875,
104
+ "grad_norm": 0.11818379908800125,
105
+ "learning_rate": 6.593750000000001e-05,
106
  "loss": 0.0288,
107
+ "mean_token_accuracy": 0.9873839795589447,
108
+ "num_tokens": 901120.0,
109
  "step": 110
110
  },
111
  {
112
  "epoch": 0.6875,
113
+ "eval_runtime": 11.8942,
114
+ "eval_samples_per_second": 13.452,
115
+ "eval_steps_per_second": 0.841,
116
  "step": 110
117
  },
118
  {
119
  "epoch": 0.75,
120
+ "grad_norm": 0.12166234850883484,
121
+ "learning_rate": 6.28125e-05,
122
+ "loss": 0.0282,
123
+ "mean_token_accuracy": 0.9881167590618134,
124
+ "num_tokens": 983040.0,
125
  "step": 120
126
  },
127
  {
128
  "epoch": 0.8125,
129
+ "grad_norm": 0.1256779283285141,
130
+ "learning_rate": 5.968750000000001e-05,
131
+ "loss": 0.0279,
132
+ "mean_token_accuracy": 0.9878847122192382,
133
+ "num_tokens": 1064960.0,
134
  "step": 130
135
  },
136
  {
137
  "epoch": 0.875,
138
+ "grad_norm": 0.11304116994142532,
139
+ "learning_rate": 5.6562500000000006e-05,
140
+ "loss": 0.0276,
141
+ "mean_token_accuracy": 0.987799221277237,
142
+ "num_tokens": 1146880.0,
143
  "step": 140
144
  },
145
  {
146
  "epoch": 0.9375,
147
+ "grad_norm": 0.10594538599252701,
148
+ "learning_rate": 5.3437500000000005e-05,
149
+ "loss": 0.0276,
150
+ "mean_token_accuracy": 0.9880923330783844,
151
+ "num_tokens": 1228800.0,
152
  "step": 150
153
  },
154
  {
155
  "epoch": 1.0,
156
+ "grad_norm": 0.1052316427230835,
157
+ "learning_rate": 5.031250000000001e-05,
158
+ "loss": 0.0274,
159
+ "mean_token_accuracy": 0.9881289720535278,
160
+ "num_tokens": 1310720.0,
161
  "step": 160
162
  },
163
  {
164
  "epoch": 1.0,
165
+ "eval_runtime": 10.8105,
166
+ "eval_samples_per_second": 14.8,
167
+ "eval_steps_per_second": 0.925,
168
  "step": 160
169
  },
170
  {
171
  "epoch": 1.0625,
172
+ "grad_norm": 0.11138579249382019,
173
+ "learning_rate": 4.71875e-05,
174
  "loss": 0.0267,
175
+ "mean_token_accuracy": 0.9883243799209595,
176
+ "num_tokens": 1392640.0,
177
  "step": 170
178
  },
179
  {
180
  "epoch": 1.125,
181
+ "grad_norm": 0.11131029576063156,
182
+ "learning_rate": 4.40625e-05,
183
  "loss": 0.027,
184
+ "mean_token_accuracy": 0.9881411850452423,
185
+ "num_tokens": 1474560.0,
186
  "step": 180
187
  },
188
  {
189
  "epoch": 1.1875,
190
+ "grad_norm": 0.11658758670091629,
191
+ "learning_rate": 4.09375e-05,
192
+ "loss": 0.0269,
193
+ "mean_token_accuracy": 0.9882755279541016,
194
+ "num_tokens": 1556480.0,
195
  "step": 190
196
  },
197
  {
198
  "epoch": 1.25,
199
+ "grad_norm": 0.11032383143901825,
200
+ "learning_rate": 3.78125e-05,
201
+ "loss": 0.0266,
202
+ "mean_token_accuracy": 0.9884098708629608,
203
+ "num_tokens": 1638400.0,
204
  "step": 200
205
  },
206
  {
207
  "epoch": 1.3125,
208
+ "grad_norm": 0.10452437400817871,
209
+ "learning_rate": 3.46875e-05,
210
+ "loss": 0.0267,
211
+ "mean_token_accuracy": 0.9882388889789582,
212
+ "num_tokens": 1720320.0,
213
  "step": 210
214
  },
215
  {
216
  "epoch": 1.375,
217
+ "grad_norm": 0.10454697161912918,
218
+ "learning_rate": 3.15625e-05,
219
+ "loss": 0.0267,
220
+ "mean_token_accuracy": 0.9878969252109527,
221
+ "num_tokens": 1802240.0,
222
  "step": 220
223
  },
224
  {
225
  "epoch": 1.4375,
226
+ "grad_norm": 0.09663768112659454,
227
+ "learning_rate": 2.84375e-05,
228
+ "loss": 0.0266,
229
+ "mean_token_accuracy": 0.9883976578712463,
230
+ "num_tokens": 1884160.0,
231
  "step": 230
232
  },
233
  {
234
  "epoch": 1.5,
235
+ "grad_norm": 0.11553214490413666,
236
+ "learning_rate": 2.53125e-05,
237
+ "loss": 0.0267,
238
+ "mean_token_accuracy": 0.9884098708629608,
239
+ "num_tokens": 1966080.0,
240
  "step": 240
241
  },
242
  {
243
  "epoch": 1.5625,
244
+ "grad_norm": 0.11412502825260162,
245
+ "learning_rate": 2.21875e-05,
246
+ "loss": 0.0263,
247
+ "mean_token_accuracy": 0.9883854448795318,
248
+ "num_tokens": 2048000.0,
249
  "step": 250
250
  },
251
  {
252
  "epoch": 1.625,
253
+ "grad_norm": 0.1027199923992157,
254
+ "learning_rate": 1.90625e-05,
255
+ "loss": 0.0264,
256
+ "mean_token_accuracy": 0.9882999539375306,
257
+ "num_tokens": 2129920.0,
258
  "step": 260
259
  },
260
  {
261
  "epoch": 1.6875,
262
+ "grad_norm": 0.12076633423566818,
263
+ "learning_rate": 1.59375e-05,
264
+ "loss": 0.0264,
265
+ "mean_token_accuracy": 0.9882266759872437,
266
+ "num_tokens": 2211840.0,
267
  "step": 270
268
  },
269
  {
270
  "epoch": 1.75,
271
+ "grad_norm": 0.1111675500869751,
272
+ "learning_rate": 1.28125e-05,
273
+ "loss": 0.0264,
274
+ "mean_token_accuracy": 0.9879335641860962,
275
+ "num_tokens": 2293760.0,
276
  "step": 280
277
  },
278
  {
279
  "epoch": 1.8125,
280
+ "grad_norm": 0.1163574606180191,
281
+ "learning_rate": 9.6875e-06,
282
+ "loss": 0.0262,
283
+ "mean_token_accuracy": 0.9885197877883911,
284
+ "num_tokens": 2375680.0,
285
  "step": 290
286
  },
287
  {
288
  "epoch": 1.875,
289
+ "grad_norm": 0.11461054533720016,
290
+ "learning_rate": 6.5625e-06,
291
+ "loss": 0.0261,
292
+ "mean_token_accuracy": 0.988593065738678,
293
+ "num_tokens": 2457600.0,
294
  "step": 300
295
  },
296
  {
297
  "epoch": 1.9375,
298
+ "grad_norm": 0.10500755161046982,
299
+ "learning_rate": 3.4375000000000005e-06,
300
+ "loss": 0.0263,
301
+ "mean_token_accuracy": 0.9882633149623871,
302
+ "num_tokens": 2539520.0,
303
  "step": 310
304
  },
305
  {
306
  "epoch": 2.0,
307
+ "grad_norm": 0.11479007452726364,
308
+ "learning_rate": 3.125e-07,
309
+ "loss": 0.0262,
310
+ "mean_token_accuracy": 0.988336592912674,
311
+ "num_tokens": 2621440.0,
312
  "step": 320
313
  },
314
  {
315
  "epoch": 2.0,
316
+ "eval_runtime": 10.8162,
317
+ "eval_samples_per_second": 14.793,
318
+ "eval_steps_per_second": 0.925,
319
  "step": 320
320
  },
321
  {
322
+ "epoch": 2.0,
323
+ "step": 320,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  "total_flos": 0.0,
325
+ "train_loss": 0.09647794626653194,
326
+ "train_runtime": 276.4027,
327
+ "train_samples_per_second": 4.631,
328
+ "train_steps_per_second": 1.158
329
  }
330
  ],
331
  "logging_steps": 10,
332
+ "max_steps": 320,
333
  "num_input_tokens_seen": 0,
334
+ "num_train_epochs": 2,
335
  "save_steps": 500,
336
  "stateful_callbacks": {
337
  "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3ea36db01dbb1c5915e1ff553ab1d2d31cd6119900ddbf872951d161132b290
3
- size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2d1ad38d9466f8a2da26fdcc2bc0cfe7f4da2baa9299dcd78d391608ab03e27
3
+ size 6033