ErkaMarka commited on
Commit
856ee30
·
verified ·
1 Parent(s): fcc65a7

Upload mongolian-mistral-7b-chatbot

Browse files
README.md ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - mn
4
+ license: apache-2.0
5
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
6
+ tags:
7
+ - mongolian
8
+ - fine-tuned
9
+ - lora
10
+ - chatbot
11
+ datasets:
12
+ - custom
13
+ ---
14
+
15
+ # mongolian-mistral-7b-chatbot
16
+
17
+ ## Description
18
+ Mistral 7B fine-tuned on Mongolian news data for chatbot
19
+
20
+ ## Model Details
21
+ - **Base Model:** mistralai/Mistral-7B-Instruct-v0.2
22
+ - **Language:** Mongolian (mn)
23
+ - **Fine-tuning Method:** LoRA (Low-Rank Adaptation)
24
+ - **Training Data:** Eduge Mongolian News Dataset (75,000+ articles)
25
+
26
+ ## Training Configuration
27
+ - **LoRA Rank:** 32
28
+ - **LoRA Alpha:** 64
29
+ - **Epochs:** 3
30
+ - **Learning Rate:** 2e-4
31
+ - **Batch Size:** 4
32
+ - **Max Sequence Length:** 1024
33
+
34
+ ## Mongolian Tokens Added
35
+ - Total new tokens: ~9,500
36
+ - Sources: Mongolian-NLP repository
37
+ - Most frequent words
38
+ - Abbreviations
39
+ - District/place names
40
+ - Country names
41
+ - Named entities (NER)
42
+
43
+ ## Usage
44
+ ```python
45
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
46
+ from peft import PeftModel
47
+
48
+ # Load tokenizer
49
+ tokenizer = AutoTokenizer.from_pretrained("ErkaMarka/mongolian-mistral-7b-chatbot")
50
+
51
+ # Load base model
52
+ base_model = AutoModelForCausalLM.from_pretrained(
53
+ "mistralai/Mistral-7B-Instruct-v0.2",
54
+ torch_dtype=torch.float16,
55
+ device_map="auto"
56
+ )
57
+
58
+ # Resize embeddings for new tokens
59
+ base_model.resize_token_embeddings(len(tokenizer))
60
+
61
+ # Load LoRA adapter
62
+ model = PeftModel.from_pretrained(base_model, "ErkaMarka/mongolian-mistral-7b-chatbot")
63
+
64
+ # Generate
65
+ messages = [{"role": "user", "content": "Монгол улсын нийслэл хот юу вэ?"}]
66
+ text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
67
+ inputs = tokenizer(text, return_tensors="pt").to(model.device)
68
+
69
+ outputs = model.generate(**inputs, max_new_tokens=150)
70
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
71
+ ```
72
+
73
+ ## Evaluation Results
74
+ Evaluated on 100 Mongolian Q&A pairs using BLEU score.
75
+
76
+ ## License
77
+ Apache 2.0
78
+
79
+ ## Citation
80
+ ```
81
+ @misc{mongolian_mistral_7b_chatbot,
82
+ author = {ErkaMarka},
83
+ title = {mongolian-mistral-7b-chatbot},
84
+ year = {2024},
85
+ publisher = {Hugging Face},
86
+ url = {https://huggingface.co/ErkaMarka/mongolian-mistral-7b-chatbot}
87
+ }
88
+ ```
adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "gate_proj",
33
+ "v_proj",
34
+ "up_proj",
35
+ "k_proj",
36
+ "down_proj",
37
+ "o_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a1eee9c4e2ea5c1715066b179c6310891607fce306c1fbbcccd5bec9e964658
3
+ size 335604696
added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
chat_template.jinja ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if messages[0]['role'] == 'system' %}
2
+ {%- set system_message = messages[0]['content'] %}
3
+ {%- set loop_messages = messages[1:] %}
4
+ {%- else %}
5
+ {%- set loop_messages = messages %}
6
+ {%- endif %}
7
+
8
+ {{- bos_token }}
9
+ {%- for message in loop_messages %}
10
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
11
+ {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
12
+ {%- endif %}
13
+ {%- if message['role'] == 'user' %}
14
+ {%- if loop.first and system_message is defined %}
15
+ {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}
16
+ {%- else %}
17
+ {{- ' [INST] ' + message['content'] + ' [/INST]' }}
18
+ {%- endif %}
19
+ {%- elif message['role'] == 'assistant' %}
20
+ {{- ' ' + message['content'] + eos_token}}
21
+ {%- else %}
22
+ {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}
23
+ {%- endif %}
24
+ {%- endfor %}
checkpoint-8500/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:mistralai/Mistral-7B-Instruct-v0.2
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-8500/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "gate_proj",
33
+ "v_proj",
34
+ "up_proj",
35
+ "k_proj",
36
+ "down_proj",
37
+ "o_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-8500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:260a083e78e7446beabba4dc2220c33d1368a3b49255e54036ce6b89f4acc62a
3
+ size 335604696
checkpoint-8500/chat_template.jinja ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if messages[0]['role'] == 'system' %}
2
+ {%- set system_message = messages[0]['content'] %}
3
+ {%- set loop_messages = messages[1:] %}
4
+ {%- else %}
5
+ {%- set loop_messages = messages %}
6
+ {%- endif %}
7
+
8
+ {{- bos_token }}
9
+ {%- for message in loop_messages %}
10
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
11
+ {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
12
+ {%- endif %}
13
+ {%- if message['role'] == 'user' %}
14
+ {%- if loop.first and system_message is defined %}
15
+ {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}
16
+ {%- else %}
17
+ {{- ' [INST] ' + message['content'] + ' [/INST]' }}
18
+ {%- endif %}
19
+ {%- elif message['role'] == 'assistant' %}
20
+ {{- ' ' + message['content'] + eos_token}}
21
+ {%- else %}
22
+ {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}
23
+ {%- endif %}
24
+ {%- endfor %}
checkpoint-8500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c33fe14dbd5003d189d64654761c330120b32218191d14801ade54126b08712
3
+ size 671466706
checkpoint-8500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7f87da9eb31a8f186e20c50eff2dab7a1ac22eb3c77f52d0da900c4cb7170c9
3
+ size 14244
checkpoint-8500/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:244785c20c15168a893df15900ee311660c9bceabe3d8c118350af5529973fa6
3
+ size 988
checkpoint-8500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ccfa9c8898c4dd40288d270472ea45c541256a641c9d0960d647c40fba7f444
3
+ size 1064
checkpoint-8500/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-8500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-8500/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-8500/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
checkpoint-8500/trainer_state.json ADDED
@@ -0,0 +1,1360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.867892694449131,
6
+ "eval_steps": 500,
7
+ "global_step": 8500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016871941960519655,
14
+ "grad_norm": 2.0177435874938965,
15
+ "learning_rate": 3.670411985018727e-05,
16
+ "loss": 1.9696,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.03374388392103931,
21
+ "grad_norm": 1.9107558727264404,
22
+ "learning_rate": 7.415730337078653e-05,
23
+ "loss": 1.5446,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.05061582588155897,
28
+ "grad_norm": 1.3704568147659302,
29
+ "learning_rate": 0.00011161048689138578,
30
+ "loss": 1.3815,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.06748776784207862,
35
+ "grad_norm": 1.3545180559158325,
36
+ "learning_rate": 0.00014906367041198505,
37
+ "loss": 1.2514,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.08435970980259828,
42
+ "grad_norm": 1.2799007892608643,
43
+ "learning_rate": 0.00018651685393258427,
44
+ "loss": 1.1922,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.10123165176311794,
49
+ "grad_norm": 1.1978015899658203,
50
+ "learning_rate": 0.0001999932072351269,
51
+ "loss": 1.1556,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.11810359372363759,
56
+ "grad_norm": 0.9664444923400879,
57
+ "learning_rate": 0.00019995539875714444,
58
+ "loss": 1.0915,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 0.13497553568415724,
63
+ "grad_norm": 1.0787692070007324,
64
+ "learning_rate": 0.0001998844378161928,
65
+ "loss": 1.0562,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 0.1518474776446769,
70
+ "grad_norm": 1.0797227621078491,
71
+ "learning_rate": 0.00019978034794806892,
72
+ "loss": 1.0253,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 0.16871941960519657,
77
+ "grad_norm": 0.9610917568206787,
78
+ "learning_rate": 0.00019964316367652584,
79
+ "loss": 1.0084,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 0.16871941960519657,
84
+ "eval_loss": 0.9864674210548401,
85
+ "eval_runtime": 297.0281,
86
+ "eval_samples_per_second": 8.403,
87
+ "eval_steps_per_second": 1.05,
88
+ "step": 500
89
+ },
90
+ {
91
+ "epoch": 0.1855913615657162,
92
+ "grad_norm": 1.020251750946045,
93
+ "learning_rate": 0.00019947293050182204,
94
+ "loss": 0.9978,
95
+ "step": 550
96
+ },
97
+ {
98
+ "epoch": 0.20246330352623587,
99
+ "grad_norm": 1.024293303489685,
100
+ "learning_rate": 0.00019926970488563033,
101
+ "loss": 0.9751,
102
+ "step": 600
103
+ },
104
+ {
105
+ "epoch": 0.21933524548675554,
106
+ "grad_norm": 1.01112699508667,
107
+ "learning_rate": 0.00019903355423231105,
108
+ "loss": 0.9533,
109
+ "step": 650
110
+ },
111
+ {
112
+ "epoch": 0.23620718744727517,
113
+ "grad_norm": 0.9841225147247314,
114
+ "learning_rate": 0.00019876455686655583,
115
+ "loss": 0.9311,
116
+ "step": 700
117
+ },
118
+ {
119
+ "epoch": 0.2530791294077948,
120
+ "grad_norm": 0.96900475025177,
121
+ "learning_rate": 0.00019846280200740965,
122
+ "loss": 0.9292,
123
+ "step": 750
124
+ },
125
+ {
126
+ "epoch": 0.2699510713683145,
127
+ "grad_norm": 0.9467640519142151,
128
+ "learning_rate": 0.000198128389738679,
129
+ "loss": 0.9316,
130
+ "step": 800
131
+ },
132
+ {
133
+ "epoch": 0.28682301332883414,
134
+ "grad_norm": 0.9673274755477905,
135
+ "learning_rate": 0.00019776143097573705,
136
+ "loss": 0.8972,
137
+ "step": 850
138
+ },
139
+ {
140
+ "epoch": 0.3036949552893538,
141
+ "grad_norm": 0.8616816401481628,
142
+ "learning_rate": 0.00019736204742873604,
143
+ "loss": 0.8998,
144
+ "step": 900
145
+ },
146
+ {
147
+ "epoch": 0.3205668972498735,
148
+ "grad_norm": 0.9249860644340515,
149
+ "learning_rate": 0.00019693037156223942,
150
+ "loss": 0.8788,
151
+ "step": 950
152
+ },
153
+ {
154
+ "epoch": 0.33743883921039314,
155
+ "grad_norm": 0.8553484678268433,
156
+ "learning_rate": 0.00019646654655128672,
157
+ "loss": 0.8766,
158
+ "step": 1000
159
+ },
160
+ {
161
+ "epoch": 0.33743883921039314,
162
+ "eval_loss": 0.8600347638130188,
163
+ "eval_runtime": 296.4688,
164
+ "eval_samples_per_second": 8.419,
165
+ "eval_steps_per_second": 1.052,
166
+ "step": 1000
167
+ },
168
+ {
169
+ "epoch": 0.35431078117091275,
170
+ "grad_norm": 0.9525033235549927,
171
+ "learning_rate": 0.00019597072623390668,
172
+ "loss": 0.8831,
173
+ "step": 1050
174
+ },
175
+ {
176
+ "epoch": 0.3711827231314324,
177
+ "grad_norm": 0.9420183300971985,
178
+ "learning_rate": 0.00019544307506009313,
179
+ "loss": 0.8662,
180
+ "step": 1100
181
+ },
182
+ {
183
+ "epoch": 0.3880546650919521,
184
+ "grad_norm": 0.9950663447380066,
185
+ "learning_rate": 0.00019488376803726153,
186
+ "loss": 0.8687,
187
+ "step": 1150
188
+ },
189
+ {
190
+ "epoch": 0.40492660705247174,
191
+ "grad_norm": 0.8866828083992004,
192
+ "learning_rate": 0.00019429299067220387,
193
+ "loss": 0.8676,
194
+ "step": 1200
195
+ },
196
+ {
197
+ "epoch": 0.4217985490129914,
198
+ "grad_norm": 0.9560692310333252,
199
+ "learning_rate": 0.00019367093890956108,
200
+ "loss": 0.8552,
201
+ "step": 1250
202
+ },
203
+ {
204
+ "epoch": 0.4386704909735111,
205
+ "grad_norm": 0.9126181602478027,
206
+ "learning_rate": 0.00019301781906683362,
207
+ "loss": 0.8335,
208
+ "step": 1300
209
+ },
210
+ {
211
+ "epoch": 0.4555424329340307,
212
+ "grad_norm": 0.9437697529792786,
213
+ "learning_rate": 0.0001923338477659515,
214
+ "loss": 0.8424,
215
+ "step": 1350
216
+ },
217
+ {
218
+ "epoch": 0.47241437489455035,
219
+ "grad_norm": 0.9593287110328674,
220
+ "learning_rate": 0.00019161925186142692,
221
+ "loss": 0.8386,
222
+ "step": 1400
223
+ },
224
+ {
225
+ "epoch": 0.48928631685507,
226
+ "grad_norm": 0.9925290942192078,
227
+ "learning_rate": 0.00019087426836511277,
228
+ "loss": 0.8431,
229
+ "step": 1450
230
+ },
231
+ {
232
+ "epoch": 0.5061582588155896,
233
+ "grad_norm": 0.9618675112724304,
234
+ "learning_rate": 0.00019009914436759223,
235
+ "loss": 0.8299,
236
+ "step": 1500
237
+ },
238
+ {
239
+ "epoch": 0.5061582588155896,
240
+ "eval_loss": 0.8129043579101562,
241
+ "eval_runtime": 296.3926,
242
+ "eval_samples_per_second": 8.421,
243
+ "eval_steps_per_second": 1.053,
244
+ "step": 1500
245
+ },
246
+ {
247
+ "epoch": 0.5230302007761093,
248
+ "grad_norm": 0.951849639415741,
249
+ "learning_rate": 0.00018929413695622572,
250
+ "loss": 0.8211,
251
+ "step": 1550
252
+ },
253
+ {
254
+ "epoch": 0.539902142736629,
255
+ "grad_norm": 0.9019031524658203,
256
+ "learning_rate": 0.00018845951312988196,
257
+ "loss": 0.8234,
258
+ "step": 1600
259
+ },
260
+ {
261
+ "epoch": 0.5567740846971486,
262
+ "grad_norm": 0.9506643414497375,
263
+ "learning_rate": 0.00018759554971038196,
264
+ "loss": 0.8194,
265
+ "step": 1650
266
+ },
267
+ {
268
+ "epoch": 0.5736460266576683,
269
+ "grad_norm": 0.9535942077636719,
270
+ "learning_rate": 0.00018670253325068456,
271
+ "loss": 0.8017,
272
+ "step": 1700
273
+ },
274
+ {
275
+ "epoch": 0.590517968618188,
276
+ "grad_norm": 1.0262322425842285,
277
+ "learning_rate": 0.00018578075993984488,
278
+ "loss": 0.8162,
279
+ "step": 1750
280
+ },
281
+ {
282
+ "epoch": 0.6073899105787076,
283
+ "grad_norm": 0.8758232593536377,
284
+ "learning_rate": 0.00018483053550477649,
285
+ "loss": 0.7917,
286
+ "step": 1800
287
+ },
288
+ {
289
+ "epoch": 0.6242618525392273,
290
+ "grad_norm": 0.9430511593818665,
291
+ "learning_rate": 0.00018385217510885008,
292
+ "loss": 0.8057,
293
+ "step": 1850
294
+ },
295
+ {
296
+ "epoch": 0.641133794499747,
297
+ "grad_norm": 0.8898816108703613,
298
+ "learning_rate": 0.00018284600324736257,
299
+ "loss": 0.7983,
300
+ "step": 1900
301
+ },
302
+ {
303
+ "epoch": 0.6580057364602666,
304
+ "grad_norm": 0.8455677628517151,
305
+ "learning_rate": 0.00018181235363991087,
306
+ "loss": 0.7793,
307
+ "step": 1950
308
+ },
309
+ {
310
+ "epoch": 0.6748776784207863,
311
+ "grad_norm": 0.8179712295532227,
312
+ "learning_rate": 0.00018075156911970616,
313
+ "loss": 0.7874,
314
+ "step": 2000
315
+ },
316
+ {
317
+ "epoch": 0.6748776784207863,
318
+ "eval_loss": 0.774493932723999,
319
+ "eval_runtime": 296.6518,
320
+ "eval_samples_per_second": 8.414,
321
+ "eval_steps_per_second": 1.052,
322
+ "step": 2000
323
+ },
324
+ {
325
+ "epoch": 0.6917496203813059,
326
+ "grad_norm": 1.012557029724121,
327
+ "learning_rate": 0.00017966400151986562,
328
+ "loss": 0.7822,
329
+ "step": 2050
330
+ },
331
+ {
332
+ "epoch": 0.7086215623418255,
333
+ "grad_norm": 0.8059019446372986,
334
+ "learning_rate": 0.00017855001155671905,
335
+ "loss": 0.7862,
336
+ "step": 2100
337
+ },
338
+ {
339
+ "epoch": 0.7254935043023452,
340
+ "grad_norm": 0.8411868810653687,
341
+ "learning_rate": 0.00017740996871016903,
342
+ "loss": 0.7789,
343
+ "step": 2150
344
+ },
345
+ {
346
+ "epoch": 0.7423654462628648,
347
+ "grad_norm": 0.9554468989372253,
348
+ "learning_rate": 0.0001762442511011448,
349
+ "loss": 0.7709,
350
+ "step": 2200
351
+ },
352
+ {
353
+ "epoch": 0.7592373882233845,
354
+ "grad_norm": 0.947722852230072,
355
+ "learning_rate": 0.00017505324536618968,
356
+ "loss": 0.7572,
357
+ "step": 2250
358
+ },
359
+ {
360
+ "epoch": 0.7761093301839042,
361
+ "grad_norm": 1.0708439350128174,
362
+ "learning_rate": 0.0001738373465292245,
363
+ "loss": 0.775,
364
+ "step": 2300
365
+ },
366
+ {
367
+ "epoch": 0.7929812721444238,
368
+ "grad_norm": 0.9122579097747803,
369
+ "learning_rate": 0.00017259695787052895,
370
+ "loss": 0.7638,
371
+ "step": 2350
372
+ },
373
+ {
374
+ "epoch": 0.8098532141049435,
375
+ "grad_norm": 1.1625076532363892,
376
+ "learning_rate": 0.00017133249079298455,
377
+ "loss": 0.7654,
378
+ "step": 2400
379
+ },
380
+ {
381
+ "epoch": 0.8267251560654632,
382
+ "grad_norm": 0.971483051776886,
383
+ "learning_rate": 0.0001700443646856237,
384
+ "loss": 0.7503,
385
+ "step": 2450
386
+ },
387
+ {
388
+ "epoch": 0.8435970980259828,
389
+ "grad_norm": 0.9200385808944702,
390
+ "learning_rate": 0.0001687330067845297,
391
+ "loss": 0.7752,
392
+ "step": 2500
393
+ },
394
+ {
395
+ "epoch": 0.8435970980259828,
396
+ "eval_loss": 0.7435723543167114,
397
+ "eval_runtime": 296.1583,
398
+ "eval_samples_per_second": 8.428,
399
+ "eval_steps_per_second": 1.053,
400
+ "step": 2500
401
+ },
402
+ {
403
+ "epoch": 0.8604690399865025,
404
+ "grad_norm": 1.0115654468536377,
405
+ "learning_rate": 0.00016739885203113442,
406
+ "loss": 0.7602,
407
+ "step": 2550
408
+ },
409
+ {
410
+ "epoch": 0.8773409819470221,
411
+ "grad_norm": 1.0669232606887817,
412
+ "learning_rate": 0.00016604234292796007,
413
+ "loss": 0.7585,
414
+ "step": 2600
415
+ },
416
+ {
417
+ "epoch": 0.8942129239075418,
418
+ "grad_norm": 0.9092018604278564,
419
+ "learning_rate": 0.00016466392939185317,
420
+ "loss": 0.7534,
421
+ "step": 2650
422
+ },
423
+ {
424
+ "epoch": 0.9110848658680614,
425
+ "grad_norm": 1.232256293296814,
426
+ "learning_rate": 0.00016326406860475977,
427
+ "loss": 0.7418,
428
+ "step": 2700
429
+ },
430
+ {
431
+ "epoch": 0.927956807828581,
432
+ "grad_norm": 1.0810918807983398,
433
+ "learning_rate": 0.00016184322486209043,
434
+ "loss": 0.7439,
435
+ "step": 2750
436
+ },
437
+ {
438
+ "epoch": 0.9448287497891007,
439
+ "grad_norm": 0.9745123982429504,
440
+ "learning_rate": 0.00016040186941872631,
441
+ "loss": 0.7421,
442
+ "step": 2800
443
+ },
444
+ {
445
+ "epoch": 0.9617006917496204,
446
+ "grad_norm": 1.1044626235961914,
447
+ "learning_rate": 0.00015894048033271684,
448
+ "loss": 0.7388,
449
+ "step": 2850
450
+ },
451
+ {
452
+ "epoch": 0.97857263371014,
453
+ "grad_norm": 0.9833297729492188,
454
+ "learning_rate": 0.00015745954230672105,
455
+ "loss": 0.7364,
456
+ "step": 2900
457
+ },
458
+ {
459
+ "epoch": 0.9954445756706597,
460
+ "grad_norm": 1.0058021545410156,
461
+ "learning_rate": 0.00015595954652724485,
462
+ "loss": 0.742,
463
+ "step": 2950
464
+ },
465
+ {
466
+ "epoch": 1.012147798211574,
467
+ "grad_norm": 1.026547908782959,
468
+ "learning_rate": 0.00015444099050172807,
469
+ "loss": 0.6689,
470
+ "step": 3000
471
+ },
472
+ {
473
+ "epoch": 1.012147798211574,
474
+ "eval_loss": 0.7237228155136108,
475
+ "eval_runtime": 296.57,
476
+ "eval_samples_per_second": 8.416,
477
+ "eval_steps_per_second": 1.052,
478
+ "step": 3000
479
+ },
480
+ {
481
+ "epoch": 1.0290197401720937,
482
+ "grad_norm": 1.1261744499206543,
483
+ "learning_rate": 0.0001529043778935349,
484
+ "loss": 0.6521,
485
+ "step": 3050
486
+ },
487
+ {
488
+ "epoch": 1.0458916821326134,
489
+ "grad_norm": 1.1573985815048218,
490
+ "learning_rate": 0.00015138147018095146,
491
+ "loss": 0.6497,
492
+ "step": 3100
493
+ },
494
+ {
495
+ "epoch": 1.062763624093133,
496
+ "grad_norm": 1.098228931427002,
497
+ "learning_rate": 0.00014981061472467248,
498
+ "loss": 0.6544,
499
+ "step": 3150
500
+ },
501
+ {
502
+ "epoch": 1.0796355660536527,
503
+ "grad_norm": 1.1172102689743042,
504
+ "learning_rate": 0.00014822323845430378,
505
+ "loss": 0.6586,
506
+ "step": 3200
507
+ },
508
+ {
509
+ "epoch": 1.0965075080141724,
510
+ "grad_norm": 1.016932487487793,
511
+ "learning_rate": 0.0001466198678589963,
512
+ "loss": 0.6721,
513
+ "step": 3250
514
+ },
515
+ {
516
+ "epoch": 1.113379449974692,
517
+ "grad_norm": 0.972201406955719,
518
+ "learning_rate": 0.00014500103473277963,
519
+ "loss": 0.6687,
520
+ "step": 3300
521
+ },
522
+ {
523
+ "epoch": 1.1302513919352117,
524
+ "grad_norm": 1.0418280363082886,
525
+ "learning_rate": 0.0001433672759981806,
526
+ "loss": 0.6643,
527
+ "step": 3350
528
+ },
529
+ {
530
+ "epoch": 1.1471233338957314,
531
+ "grad_norm": 1.0220147371292114,
532
+ "learning_rate": 0.00014171913352814075,
533
+ "loss": 0.6538,
534
+ "step": 3400
535
+ },
536
+ {
537
+ "epoch": 1.163995275856251,
538
+ "grad_norm": 0.9467246532440186,
539
+ "learning_rate": 0.000140057153966292,
540
+ "loss": 0.6427,
541
+ "step": 3450
542
+ },
543
+ {
544
+ "epoch": 1.1808672178167707,
545
+ "grad_norm": 1.1602420806884766,
546
+ "learning_rate": 0.00013838188854564993,
547
+ "loss": 0.6496,
548
+ "step": 3500
549
+ },
550
+ {
551
+ "epoch": 1.1808672178167707,
552
+ "eval_loss": 0.6991020441055298,
553
+ "eval_runtime": 296.4391,
554
+ "eval_samples_per_second": 8.42,
555
+ "eval_steps_per_second": 1.052,
556
+ "step": 3500
557
+ },
558
+ {
559
+ "epoch": 1.1977391597772904,
560
+ "grad_norm": 1.0359673500061035,
561
+ "learning_rate": 0.00013669389290578491,
562
+ "loss": 0.6574,
563
+ "step": 3550
564
+ },
565
+ {
566
+ "epoch": 1.21461110173781,
567
+ "grad_norm": 1.0353167057037354,
568
+ "learning_rate": 0.0001349937269085317,
569
+ "loss": 0.6462,
570
+ "step": 3600
571
+ },
572
+ {
573
+ "epoch": 1.2314830436983297,
574
+ "grad_norm": 0.9980498552322388,
575
+ "learning_rate": 0.00013328195445229868,
576
+ "loss": 0.6515,
577
+ "step": 3650
578
+ },
579
+ {
580
+ "epoch": 1.2483549856588494,
581
+ "grad_norm": 1.051719069480896,
582
+ "learning_rate": 0.0001315591432850381,
583
+ "loss": 0.6546,
584
+ "step": 3700
585
+ },
586
+ {
587
+ "epoch": 1.265226927619369,
588
+ "grad_norm": 1.1057497262954712,
589
+ "learning_rate": 0.0001298258648159399,
590
+ "loss": 0.6313,
591
+ "step": 3750
592
+ },
593
+ {
594
+ "epoch": 1.2820988695798887,
595
+ "grad_norm": 1.1292750835418701,
596
+ "learning_rate": 0.0001280826939259106,
597
+ "loss": 0.6329,
598
+ "step": 3800
599
+ },
600
+ {
601
+ "epoch": 1.2989708115404084,
602
+ "grad_norm": 1.1297887563705444,
603
+ "learning_rate": 0.00012633020877690155,
604
+ "loss": 0.6384,
605
+ "step": 3850
606
+ },
607
+ {
608
+ "epoch": 1.3158427535009278,
609
+ "grad_norm": 0.9869160652160645,
610
+ "learning_rate": 0.00012456899062014806,
611
+ "loss": 0.6226,
612
+ "step": 3900
613
+ },
614
+ {
615
+ "epoch": 1.3327146954614477,
616
+ "grad_norm": 0.9497949481010437,
617
+ "learning_rate": 0.00012279962360338447,
618
+ "loss": 0.6225,
619
+ "step": 3950
620
+ },
621
+ {
622
+ "epoch": 1.3495866374219672,
623
+ "grad_norm": 1.053124189376831,
624
+ "learning_rate": 0.00012102269457709843,
625
+ "loss": 0.6196,
626
+ "step": 4000
627
+ },
628
+ {
629
+ "epoch": 1.3495866374219672,
630
+ "eval_loss": 0.6678062081336975,
631
+ "eval_runtime": 296.2326,
632
+ "eval_samples_per_second": 8.426,
633
+ "eval_steps_per_second": 1.053,
634
+ "step": 4000
635
+ },
636
+ {
637
+ "epoch": 1.366458579382487,
638
+ "grad_norm": 1.1627007722854614,
639
+ "learning_rate": 0.0001192387928998886,
640
+ "loss": 0.6527,
641
+ "step": 4050
642
+ },
643
+ {
644
+ "epoch": 1.3833305213430065,
645
+ "grad_norm": 1.329759955406189,
646
+ "learning_rate": 0.00011744851024299069,
647
+ "loss": 0.6297,
648
+ "step": 4100
649
+ },
650
+ {
651
+ "epoch": 1.4002024633035262,
652
+ "grad_norm": 1.1688311100006104,
653
+ "learning_rate": 0.00011565244039403622,
654
+ "loss": 0.63,
655
+ "step": 4150
656
+ },
657
+ {
658
+ "epoch": 1.4170744052640458,
659
+ "grad_norm": 0.9974623918533325,
660
+ "learning_rate": 0.00011385117906010953,
661
+ "loss": 0.6394,
662
+ "step": 4200
663
+ },
664
+ {
665
+ "epoch": 1.4339463472245655,
666
+ "grad_norm": 1.0659152269363403,
667
+ "learning_rate": 0.00011204532367016806,
668
+ "loss": 0.6181,
669
+ "step": 4250
670
+ },
671
+ {
672
+ "epoch": 1.4508182891850852,
673
+ "grad_norm": 1.0745680332183838,
674
+ "learning_rate": 0.00011027170545816326,
675
+ "loss": 0.6281,
676
+ "step": 4300
677
+ },
678
+ {
679
+ "epoch": 1.4676902311456048,
680
+ "grad_norm": 1.1593629121780396,
681
+ "learning_rate": 0.00010845852214547601,
682
+ "loss": 0.6296,
683
+ "step": 4350
684
+ },
685
+ {
686
+ "epoch": 1.4845621731061245,
687
+ "grad_norm": 1.3579237461090088,
688
+ "learning_rate": 0.00010664253337309687,
689
+ "loss": 0.6152,
690
+ "step": 4400
691
+ },
692
+ {
693
+ "epoch": 1.5014341150666441,
694
+ "grad_norm": 1.2364161014556885,
695
+ "learning_rate": 0.00010482434145467046,
696
+ "loss": 0.6067,
697
+ "step": 4450
698
+ },
699
+ {
700
+ "epoch": 1.5183060570271638,
701
+ "grad_norm": 1.0740337371826172,
702
+ "learning_rate": 0.00010300454943456457,
703
+ "loss": 0.6175,
704
+ "step": 4500
705
+ },
706
+ {
707
+ "epoch": 1.5183060570271638,
708
+ "eval_loss": 0.6415057182312012,
709
+ "eval_runtime": 296.7867,
710
+ "eval_samples_per_second": 8.41,
711
+ "eval_steps_per_second": 1.051,
712
+ "step": 4500
713
+ },
714
+ {
715
+ "epoch": 1.5351779989876835,
716
+ "grad_norm": 1.1068068742752075,
717
+ "learning_rate": 0.00010118376088785673,
718
+ "loss": 0.6221,
719
+ "step": 4550
720
+ },
721
+ {
722
+ "epoch": 1.5520499409482031,
723
+ "grad_norm": 1.1202526092529297,
724
+ "learning_rate": 9.936257972014506e-05,
725
+ "loss": 0.6198,
726
+ "step": 4600
727
+ },
728
+ {
729
+ "epoch": 1.5689218829087228,
730
+ "grad_norm": 1.1293288469314575,
731
+ "learning_rate": 9.754160996724927e-05,
732
+ "loss": 0.5997,
733
+ "step": 4650
734
+ },
735
+ {
736
+ "epoch": 1.5857938248692425,
737
+ "grad_norm": 1.0531474351882935,
738
+ "learning_rate": 9.572145559486855e-05,
739
+ "loss": 0.6041,
740
+ "step": 4700
741
+ },
742
+ {
743
+ "epoch": 1.6026657668297621,
744
+ "grad_norm": 0.895604133605957,
745
+ "learning_rate": 9.390272029826282e-05,
746
+ "loss": 0.6005,
747
+ "step": 4750
748
+ },
749
+ {
750
+ "epoch": 1.6195377087902818,
751
+ "grad_norm": 1.1192911863327026,
752
+ "learning_rate": 9.208600730202339e-05,
753
+ "loss": 0.5992,
754
+ "step": 4800
755
+ },
756
+ {
757
+ "epoch": 1.6364096507508015,
758
+ "grad_norm": 1.1751166582107544,
759
+ "learning_rate": 9.027191916000018e-05,
760
+ "loss": 0.586,
761
+ "step": 4850
762
+ },
763
+ {
764
+ "epoch": 1.6532815927113211,
765
+ "grad_norm": 0.9682310223579407,
766
+ "learning_rate": 8.846105755545086e-05,
767
+ "loss": 0.5969,
768
+ "step": 4900
769
+ },
770
+ {
771
+ "epoch": 1.6701535346718406,
772
+ "grad_norm": 1.1701383590698242,
773
+ "learning_rate": 8.665402310147924e-05,
774
+ "loss": 0.579,
775
+ "step": 4950
776
+ },
777
+ {
778
+ "epoch": 1.6870254766323605,
779
+ "grad_norm": 1.0403779745101929,
780
+ "learning_rate": 8.485141514182825e-05,
781
+ "loss": 0.5788,
782
+ "step": 5000
783
+ },
784
+ {
785
+ "epoch": 1.6870254766323605,
786
+ "eval_loss": 0.6112694144248962,
787
+ "eval_runtime": 296.234,
788
+ "eval_samples_per_second": 8.426,
789
+ "eval_steps_per_second": 1.053,
790
+ "step": 5000
791
+ },
792
+ {
793
+ "epoch": 1.70389741859288,
794
+ "grad_norm": 1.126141905784607,
795
+ "learning_rate": 8.305383155209414e-05,
796
+ "loss": 0.5862,
797
+ "step": 5050
798
+ },
799
+ {
800
+ "epoch": 1.7207693605533998,
801
+ "grad_norm": 1.0474798679351807,
802
+ "learning_rate": 8.126186854142752e-05,
803
+ "loss": 0.579,
804
+ "step": 5100
805
+ },
806
+ {
807
+ "epoch": 1.7376413025139192,
808
+ "grad_norm": 2.020526647567749,
809
+ "learning_rate": 7.947612045478724e-05,
810
+ "loss": 0.5636,
811
+ "step": 5150
812
+ },
813
+ {
814
+ "epoch": 1.7545132444744391,
815
+ "grad_norm": 1.0213319063186646,
816
+ "learning_rate": 7.76971795758122e-05,
817
+ "loss": 0.5695,
818
+ "step": 5200
819
+ },
820
+ {
821
+ "epoch": 1.7713851864349586,
822
+ "grad_norm": 1.034336805343628,
823
+ "learning_rate": 7.592563593037746e-05,
824
+ "loss": 0.5849,
825
+ "step": 5250
826
+ },
827
+ {
828
+ "epoch": 1.7882571283954785,
829
+ "grad_norm": 1.0699772834777832,
830
+ "learning_rate": 7.41972662287419e-05,
831
+ "loss": 0.5714,
832
+ "step": 5300
833
+ },
834
+ {
835
+ "epoch": 1.805129070355998,
836
+ "grad_norm": 1.1747502088546753,
837
+ "learning_rate": 7.244210001050232e-05,
838
+ "loss": 0.5604,
839
+ "step": 5350
840
+ },
841
+ {
842
+ "epoch": 1.8220010123165178,
843
+ "grad_norm": 1.082960605621338,
844
+ "learning_rate": 7.069607399149428e-05,
845
+ "loss": 0.5551,
846
+ "step": 5400
847
+ },
848
+ {
849
+ "epoch": 1.8388729542770372,
850
+ "grad_norm": 0.9143629670143127,
851
+ "learning_rate": 6.895976728063694e-05,
852
+ "loss": 0.5581,
853
+ "step": 5450
854
+ },
855
+ {
856
+ "epoch": 1.855744896237557,
857
+ "grad_norm": 1.0392253398895264,
858
+ "learning_rate": 6.723375576322166e-05,
859
+ "loss": 0.5506,
860
+ "step": 5500
861
+ },
862
+ {
863
+ "epoch": 1.855744896237557,
864
+ "eval_loss": 0.5858550667762756,
865
+ "eval_runtime": 296.794,
866
+ "eval_samples_per_second": 8.41,
867
+ "eval_steps_per_second": 1.051,
868
+ "step": 5500
869
+ },
870
+ {
871
+ "epoch": 1.8726168381980766,
872
+ "grad_norm": 1.1118271350860596,
873
+ "learning_rate": 6.551861190990665e-05,
874
+ "loss": 0.5508,
875
+ "step": 5550
876
+ },
877
+ {
878
+ "epoch": 1.8894887801585962,
879
+ "grad_norm": 1.0717848539352417,
880
+ "learning_rate": 6.381490458684407e-05,
881
+ "loss": 0.5489,
882
+ "step": 5600
883
+ },
884
+ {
885
+ "epoch": 1.906360722119116,
886
+ "grad_norm": 0.9712995290756226,
887
+ "learning_rate": 6.212319886700289e-05,
888
+ "loss": 0.547,
889
+ "step": 5650
890
+ },
891
+ {
892
+ "epoch": 1.9232326640796356,
893
+ "grad_norm": 1.0615884065628052,
894
+ "learning_rate": 6.044405584274961e-05,
895
+ "loss": 0.54,
896
+ "step": 5700
897
+ },
898
+ {
899
+ "epoch": 1.9401046060401552,
900
+ "grad_norm": 1.0537705421447754,
901
+ "learning_rate": 5.8778032439749284e-05,
902
+ "loss": 0.5279,
903
+ "step": 5750
904
+ },
905
+ {
906
+ "epoch": 1.9569765480006749,
907
+ "grad_norm": 1.081616759300232,
908
+ "learning_rate": 5.7125681232248684e-05,
909
+ "loss": 0.5473,
910
+ "step": 5800
911
+ },
912
+ {
913
+ "epoch": 1.9738484899611946,
914
+ "grad_norm": 1.1939841508865356,
915
+ "learning_rate": 5.548755025980237e-05,
916
+ "loss": 0.5422,
917
+ "step": 5850
918
+ },
919
+ {
920
+ "epoch": 1.9907204319217142,
921
+ "grad_norm": 1.1471023559570312,
922
+ "learning_rate": 5.3864182845503296e-05,
923
+ "loss": 0.5484,
924
+ "step": 5900
925
+ },
926
+ {
927
+ "epoch": 2.0074236544626287,
928
+ "grad_norm": 1.2721205949783325,
929
+ "learning_rate": 5.225611741577716e-05,
930
+ "loss": 0.4849,
931
+ "step": 5950
932
+ },
933
+ {
934
+ "epoch": 2.024295596423148,
935
+ "grad_norm": 1.3426976203918457,
936
+ "learning_rate": 5.066388732180136e-05,
937
+ "loss": 0.4106,
938
+ "step": 6000
939
+ },
940
+ {
941
+ "epoch": 2.024295596423148,
942
+ "eval_loss": 0.563025176525116,
943
+ "eval_runtime": 296.3567,
944
+ "eval_samples_per_second": 8.422,
945
+ "eval_steps_per_second": 1.053,
946
+ "step": 6000
947
+ },
948
+ {
949
+ "epoch": 2.041167538383668,
950
+ "grad_norm": 1.2091578245162964,
951
+ "learning_rate": 4.908802066260697e-05,
952
+ "loss": 0.3968,
953
+ "step": 6050
954
+ },
955
+ {
956
+ "epoch": 2.0580394803441875,
957
+ "grad_norm": 1.1719856262207031,
958
+ "learning_rate": 4.7529040109922584e-05,
959
+ "loss": 0.4055,
960
+ "step": 6100
961
+ },
962
+ {
963
+ "epoch": 2.0749114223047074,
964
+ "grad_norm": 1.3152216672897339,
965
+ "learning_rate": 4.598746273481881e-05,
966
+ "loss": 0.395,
967
+ "step": 6150
968
+ },
969
+ {
970
+ "epoch": 2.091783364265227,
971
+ "grad_norm": 1.2058014869689941,
972
+ "learning_rate": 4.446379983620979e-05,
973
+ "loss": 0.3895,
974
+ "step": 6200
975
+ },
976
+ {
977
+ "epoch": 2.1086553062257467,
978
+ "grad_norm": 1.1317533254623413,
979
+ "learning_rate": 4.2988477878823355e-05,
980
+ "loss": 0.4027,
981
+ "step": 6250
982
+ },
983
+ {
984
+ "epoch": 2.125527248186266,
985
+ "grad_norm": 1.241541862487793,
986
+ "learning_rate": 4.1501770661428595e-05,
987
+ "loss": 0.411,
988
+ "step": 6300
989
+ },
990
+ {
991
+ "epoch": 2.142399190146786,
992
+ "grad_norm": 1.170419454574585,
993
+ "learning_rate": 4.003446570150093e-05,
994
+ "loss": 0.3995,
995
+ "step": 6350
996
+ },
997
+ {
998
+ "epoch": 2.1592711321073055,
999
+ "grad_norm": 1.35177743434906,
1000
+ "learning_rate": 3.858704966383232e-05,
1001
+ "loss": 0.4068,
1002
+ "step": 6400
1003
+ },
1004
+ {
1005
+ "epoch": 2.1761430740678254,
1006
+ "grad_norm": 1.2839558124542236,
1007
+ "learning_rate": 3.71600026166051e-05,
1008
+ "loss": 0.4122,
1009
+ "step": 6450
1010
+ },
1011
+ {
1012
+ "epoch": 2.193015016028345,
1013
+ "grad_norm": 1.3839994668960571,
1014
+ "learning_rate": 3.575379787216629e-05,
1015
+ "loss": 0.4044,
1016
+ "step": 6500
1017
+ },
1018
+ {
1019
+ "epoch": 2.193015016028345,
1020
+ "eval_loss": 0.5397977232933044,
1021
+ "eval_runtime": 296.486,
1022
+ "eval_samples_per_second": 8.419,
1023
+ "eval_steps_per_second": 1.052,
1024
+ "step": 6500
1025
+ },
1026
+ {
1027
+ "epoch": 2.2098869579888647,
1028
+ "grad_norm": 1.2803045511245728,
1029
+ "learning_rate": 3.436890183004309e-05,
1030
+ "loss": 0.3822,
1031
+ "step": 6550
1032
+ },
1033
+ {
1034
+ "epoch": 2.226758899949384,
1035
+ "grad_norm": 1.3744126558303833,
1036
+ "learning_rate": 3.300577382225076e-05,
1037
+ "loss": 0.3915,
1038
+ "step": 6600
1039
+ },
1040
+ {
1041
+ "epoch": 2.243630841909904,
1042
+ "grad_norm": 1.1824839115142822,
1043
+ "learning_rate": 3.1664865960945e-05,
1044
+ "loss": 0.3884,
1045
+ "step": 6650
1046
+ },
1047
+ {
1048
+ "epoch": 2.2605027838704235,
1049
+ "grad_norm": 1.3948158025741577,
1050
+ "learning_rate": 3.03466229884686e-05,
1051
+ "loss": 0.382,
1052
+ "step": 6700
1053
+ },
1054
+ {
1055
+ "epoch": 2.277374725830943,
1056
+ "grad_norm": 1.2800052165985107,
1057
+ "learning_rate": 2.9051482129842577e-05,
1058
+ "loss": 0.3914,
1059
+ "step": 6750
1060
+ },
1061
+ {
1062
+ "epoch": 2.294246667791463,
1063
+ "grad_norm": 1.3162052631378174,
1064
+ "learning_rate": 2.777987294775086e-05,
1065
+ "loss": 0.3865,
1066
+ "step": 6800
1067
+ },
1068
+ {
1069
+ "epoch": 2.3111186097519827,
1070
+ "grad_norm": 1.1745421886444092,
1071
+ "learning_rate": 2.6532217200065858e-05,
1072
+ "loss": 0.3752,
1073
+ "step": 6850
1074
+ },
1075
+ {
1076
+ "epoch": 2.327990551712502,
1077
+ "grad_norm": 1.5241754055023193,
1078
+ "learning_rate": 2.5308928699963153e-05,
1079
+ "loss": 0.368,
1080
+ "step": 6900
1081
+ },
1082
+ {
1083
+ "epoch": 2.3448624936730216,
1084
+ "grad_norm": 1.3691622018814087,
1085
+ "learning_rate": 2.4110413178670878e-05,
1086
+ "loss": 0.3715,
1087
+ "step": 6950
1088
+ },
1089
+ {
1090
+ "epoch": 2.3617344356335415,
1091
+ "grad_norm": 1.206244707107544,
1092
+ "learning_rate": 2.2937068150899967e-05,
1093
+ "loss": 0.3781,
1094
+ "step": 7000
1095
+ },
1096
+ {
1097
+ "epoch": 2.3617344356335415,
1098
+ "eval_loss": 0.5177870392799377,
1099
+ "eval_runtime": 296.651,
1100
+ "eval_samples_per_second": 8.414,
1101
+ "eval_steps_per_second": 1.052,
1102
+ "step": 7000
1103
+ },
1104
+ {
1105
+ "epoch": 2.378606377594061,
1106
+ "grad_norm": 1.3172132968902588,
1107
+ "learning_rate": 2.1789282782999254e-05,
1108
+ "loss": 0.3787,
1109
+ "step": 7050
1110
+ },
1111
+ {
1112
+ "epoch": 2.395478319554581,
1113
+ "grad_norm": 1.4043761491775513,
1114
+ "learning_rate": 2.066743776387974e-05,
1115
+ "loss": 0.3798,
1116
+ "step": 7100
1117
+ },
1118
+ {
1119
+ "epoch": 2.4123502615151002,
1120
+ "grad_norm": 1.2046414613723755,
1121
+ "learning_rate": 1.957190517875064e-05,
1122
+ "loss": 0.3755,
1123
+ "step": 7150
1124
+ },
1125
+ {
1126
+ "epoch": 2.42922220347562,
1127
+ "grad_norm": 1.214189052581787,
1128
+ "learning_rate": 1.850304838570879e-05,
1129
+ "loss": 0.3722,
1130
+ "step": 7200
1131
+ },
1132
+ {
1133
+ "epoch": 2.4460941454361396,
1134
+ "grad_norm": 1.3583543300628662,
1135
+ "learning_rate": 1.7461221895222724e-05,
1136
+ "loss": 0.3667,
1137
+ "step": 7250
1138
+ },
1139
+ {
1140
+ "epoch": 2.4629660873966595,
1141
+ "grad_norm": 1.5781126022338867,
1142
+ "learning_rate": 1.644677125255143e-05,
1143
+ "loss": 0.3672,
1144
+ "step": 7300
1145
+ },
1146
+ {
1147
+ "epoch": 2.479838029357179,
1148
+ "grad_norm": 1.2086187601089478,
1149
+ "learning_rate": 1.546003292313629e-05,
1150
+ "loss": 0.358,
1151
+ "step": 7350
1152
+ },
1153
+ {
1154
+ "epoch": 2.496709971317699,
1155
+ "grad_norm": 1.1627147197723389,
1156
+ "learning_rate": 1.4501334181004889e-05,
1157
+ "loss": 0.3826,
1158
+ "step": 7400
1159
+ },
1160
+ {
1161
+ "epoch": 2.5135819132782182,
1162
+ "grad_norm": 1.4055496454238892,
1163
+ "learning_rate": 1.3570993000223043e-05,
1164
+ "loss": 0.356,
1165
+ "step": 7450
1166
+ },
1167
+ {
1168
+ "epoch": 2.530453855238738,
1169
+ "grad_norm": 1.2799688577651978,
1170
+ "learning_rate": 1.2669317949431659e-05,
1171
+ "loss": 0.3576,
1172
+ "step": 7500
1173
+ },
1174
+ {
1175
+ "epoch": 2.530453855238738,
1176
+ "eval_loss": 0.49860402941703796,
1177
+ "eval_runtime": 296.3625,
1178
+ "eval_samples_per_second": 8.422,
1179
+ "eval_steps_per_second": 1.053,
1180
+ "step": 7500
1181
+ },
1182
+ {
1183
+ "epoch": 2.5473257971992576,
1184
+ "grad_norm": 1.321845531463623,
1185
+ "learning_rate": 1.1796608089502948e-05,
1186
+ "loss": 0.3661,
1187
+ "step": 7550
1188
+ },
1189
+ {
1190
+ "epoch": 2.5641977391597774,
1191
+ "grad_norm": 1.3550702333450317,
1192
+ "learning_rate": 1.0953152874350059e-05,
1193
+ "loss": 0.365,
1194
+ "step": 7600
1195
+ },
1196
+ {
1197
+ "epoch": 2.581069681120297,
1198
+ "grad_norm": 1.2584900856018066,
1199
+ "learning_rate": 1.0139232054923287e-05,
1200
+ "loss": 0.3535,
1201
+ "step": 7650
1202
+ },
1203
+ {
1204
+ "epoch": 2.5979416230808168,
1205
+ "grad_norm": 1.376833438873291,
1206
+ "learning_rate": 9.355115586424224e-06,
1207
+ "loss": 0.3502,
1208
+ "step": 7700
1209
+ },
1210
+ {
1211
+ "epoch": 2.614813565041336,
1212
+ "grad_norm": 1.1930843591690063,
1213
+ "learning_rate": 8.601063538769182e-06,
1214
+ "loss": 0.3705,
1215
+ "step": 7750
1216
+ },
1217
+ {
1218
+ "epoch": 2.6316855070018557,
1219
+ "grad_norm": 1.4217926263809204,
1220
+ "learning_rate": 7.877326010330977e-06,
1221
+ "loss": 0.3459,
1222
+ "step": 7800
1223
+ },
1224
+ {
1225
+ "epoch": 2.6485574489623755,
1226
+ "grad_norm": 1.2762850522994995,
1227
+ "learning_rate": 7.1841430449882895e-06,
1228
+ "loss": 0.3436,
1229
+ "step": 7850
1230
+ },
1231
+ {
1232
+ "epoch": 2.6654293909228954,
1233
+ "grad_norm": 1.2758060693740845,
1234
+ "learning_rate": 6.521744552509635e-06,
1235
+ "loss": 0.3575,
1236
+ "step": 7900
1237
+ },
1238
+ {
1239
+ "epoch": 2.682301332883415,
1240
+ "grad_norm": 1.4101016521453857,
1241
+ "learning_rate": 5.890350232298591e-06,
1242
+ "loss": 0.3513,
1243
+ "step": 7950
1244
+ },
1245
+ {
1246
+ "epoch": 2.6991732748439343,
1247
+ "grad_norm": 1.1934149265289307,
1248
+ "learning_rate": 5.290169500525577e-06,
1249
+ "loss": 0.3472,
1250
+ "step": 8000
1251
+ },
1252
+ {
1253
+ "epoch": 2.6991732748439343,
1254
+ "eval_loss": 0.4890361726284027,
1255
+ "eval_runtime": 296.7147,
1256
+ "eval_samples_per_second": 8.412,
1257
+ "eval_steps_per_second": 1.052,
1258
+ "step": 8000
1259
+ },
1260
+ {
1261
+ "epoch": 2.716045216804454,
1262
+ "grad_norm": 1.1966798305511475,
1263
+ "learning_rate": 4.721401420670224e-06,
1264
+ "loss": 0.3476,
1265
+ "step": 8050
1266
+ },
1267
+ {
1268
+ "epoch": 2.732917158764974,
1269
+ "grad_norm": 1.326464295387268,
1270
+ "learning_rate": 4.184234637497486e-06,
1271
+ "loss": 0.3607,
1272
+ "step": 8100
1273
+ },
1274
+ {
1275
+ "epoch": 2.7497891007254935,
1276
+ "grad_norm": 1.1875063180923462,
1277
+ "learning_rate": 3.6788473144893976e-06,
1278
+ "loss": 0.3421,
1279
+ "step": 8150
1280
+ },
1281
+ {
1282
+ "epoch": 2.766661042686013,
1283
+ "grad_norm": 1.318291425704956,
1284
+ "learning_rate": 3.20540707475302e-06,
1285
+ "loss": 0.3567,
1286
+ "step": 8200
1287
+ },
1288
+ {
1289
+ "epoch": 2.783532984646533,
1290
+ "grad_norm": 1.3796924352645874,
1291
+ "learning_rate": 2.7640709454245904e-06,
1292
+ "loss": 0.3543,
1293
+ "step": 8250
1294
+ },
1295
+ {
1296
+ "epoch": 2.8004049266070523,
1297
+ "grad_norm": 1.188733696937561,
1298
+ "learning_rate": 2.3549853055878314e-06,
1299
+ "loss": 0.3461,
1300
+ "step": 8300
1301
+ },
1302
+ {
1303
+ "epoch": 2.817276868567572,
1304
+ "grad_norm": 1.4971429109573364,
1305
+ "learning_rate": 1.978285837724092e-06,
1306
+ "loss": 0.345,
1307
+ "step": 8350
1308
+ },
1309
+ {
1310
+ "epoch": 2.8341488105280916,
1311
+ "grad_norm": 1.218836784362793,
1312
+ "learning_rate": 1.6340974827101286e-06,
1313
+ "loss": 0.3628,
1314
+ "step": 8400
1315
+ },
1316
+ {
1317
+ "epoch": 2.8510207524886115,
1318
+ "grad_norm": 1.3097290992736816,
1319
+ "learning_rate": 1.3225343983787054e-06,
1320
+ "loss": 0.3515,
1321
+ "step": 8450
1322
+ },
1323
+ {
1324
+ "epoch": 2.867892694449131,
1325
+ "grad_norm": 1.602677822113037,
1326
+ "learning_rate": 1.0436999216555276e-06,
1327
+ "loss": 0.3506,
1328
+ "step": 8500
1329
+ },
1330
+ {
1331
+ "epoch": 2.867892694449131,
1332
+ "eval_loss": 0.48525503277778625,
1333
+ "eval_runtime": 296.1461,
1334
+ "eval_samples_per_second": 8.428,
1335
+ "eval_steps_per_second": 1.054,
1336
+ "step": 8500
1337
+ }
1338
+ ],
1339
+ "logging_steps": 50,
1340
+ "max_steps": 8892,
1341
+ "num_input_tokens_seen": 0,
1342
+ "num_train_epochs": 3,
1343
+ "save_steps": 500,
1344
+ "stateful_callbacks": {
1345
+ "TrainerControl": {
1346
+ "args": {
1347
+ "should_epoch_stop": false,
1348
+ "should_evaluate": false,
1349
+ "should_log": false,
1350
+ "should_save": true,
1351
+ "should_training_stop": false
1352
+ },
1353
+ "attributes": {}
1354
+ }
1355
+ },
1356
+ "total_flos": 5.897794151360692e+18,
1357
+ "train_batch_size": 8,
1358
+ "trial_name": null,
1359
+ "trial_params": null
1360
+ }
checkpoint-8500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89962928316ba9167e9dc9efd4961d21c08e95c6bd6d9ad576dbd0f3c12dcdaf
3
+ size 5688
checkpoint-8892/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:mistralai/Mistral-7B-Instruct-v0.2
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-8892/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "gate_proj",
33
+ "v_proj",
34
+ "up_proj",
35
+ "k_proj",
36
+ "down_proj",
37
+ "o_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-8892/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a1eee9c4e2ea5c1715066b179c6310891607fce306c1fbbcccd5bec9e964658
3
+ size 335604696
checkpoint-8892/chat_template.jinja ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if messages[0]['role'] == 'system' %}
2
+ {%- set system_message = messages[0]['content'] %}
3
+ {%- set loop_messages = messages[1:] %}
4
+ {%- else %}
5
+ {%- set loop_messages = messages %}
6
+ {%- endif %}
7
+
8
+ {{- bos_token }}
9
+ {%- for message in loop_messages %}
10
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
11
+ {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
12
+ {%- endif %}
13
+ {%- if message['role'] == 'user' %}
14
+ {%- if loop.first and system_message is defined %}
15
+ {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}
16
+ {%- else %}
17
+ {{- ' [INST] ' + message['content'] + ' [/INST]' }}
18
+ {%- endif %}
19
+ {%- elif message['role'] == 'assistant' %}
20
+ {{- ' ' + message['content'] + eos_token}}
21
+ {%- else %}
22
+ {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}
23
+ {%- endif %}
24
+ {%- endfor %}
checkpoint-8892/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4d77f57ea57de18290a3c24d533d67672053aa6701c28dcd1bcb7c573c485f2
3
+ size 671466706
checkpoint-8892/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b9512d0f3bb196d04b0b009f14bcc5e93ec5a06396c54f604aaf155347c279f
3
+ size 14244
checkpoint-8892/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55643735d58dd4a0834e482b714c2d08dd2314c07893bc0cfe906c0cc1756d43
3
+ size 988
checkpoint-8892/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:451501e506444b3ce6856a786bd233a9c0c78c93bbd16ff058d031b11754317d
3
+ size 1064
checkpoint-8892/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-8892/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-8892/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-8892/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
checkpoint-8892/trainer_state.json ADDED
@@ -0,0 +1,1409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 8892,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016871941960519655,
14
+ "grad_norm": 2.0177435874938965,
15
+ "learning_rate": 3.670411985018727e-05,
16
+ "loss": 1.9696,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.03374388392103931,
21
+ "grad_norm": 1.9107558727264404,
22
+ "learning_rate": 7.415730337078653e-05,
23
+ "loss": 1.5446,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.05061582588155897,
28
+ "grad_norm": 1.3704568147659302,
29
+ "learning_rate": 0.00011161048689138578,
30
+ "loss": 1.3815,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.06748776784207862,
35
+ "grad_norm": 1.3545180559158325,
36
+ "learning_rate": 0.00014906367041198505,
37
+ "loss": 1.2514,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.08435970980259828,
42
+ "grad_norm": 1.2799007892608643,
43
+ "learning_rate": 0.00018651685393258427,
44
+ "loss": 1.1922,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.10123165176311794,
49
+ "grad_norm": 1.1978015899658203,
50
+ "learning_rate": 0.0001999932072351269,
51
+ "loss": 1.1556,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.11810359372363759,
56
+ "grad_norm": 0.9664444923400879,
57
+ "learning_rate": 0.00019995539875714444,
58
+ "loss": 1.0915,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 0.13497553568415724,
63
+ "grad_norm": 1.0787692070007324,
64
+ "learning_rate": 0.0001998844378161928,
65
+ "loss": 1.0562,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 0.1518474776446769,
70
+ "grad_norm": 1.0797227621078491,
71
+ "learning_rate": 0.00019978034794806892,
72
+ "loss": 1.0253,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 0.16871941960519657,
77
+ "grad_norm": 0.9610917568206787,
78
+ "learning_rate": 0.00019964316367652584,
79
+ "loss": 1.0084,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 0.16871941960519657,
84
+ "eval_loss": 0.9864674210548401,
85
+ "eval_runtime": 297.0281,
86
+ "eval_samples_per_second": 8.403,
87
+ "eval_steps_per_second": 1.05,
88
+ "step": 500
89
+ },
90
+ {
91
+ "epoch": 0.1855913615657162,
92
+ "grad_norm": 1.020251750946045,
93
+ "learning_rate": 0.00019947293050182204,
94
+ "loss": 0.9978,
95
+ "step": 550
96
+ },
97
+ {
98
+ "epoch": 0.20246330352623587,
99
+ "grad_norm": 1.024293303489685,
100
+ "learning_rate": 0.00019926970488563033,
101
+ "loss": 0.9751,
102
+ "step": 600
103
+ },
104
+ {
105
+ "epoch": 0.21933524548675554,
106
+ "grad_norm": 1.01112699508667,
107
+ "learning_rate": 0.00019903355423231105,
108
+ "loss": 0.9533,
109
+ "step": 650
110
+ },
111
+ {
112
+ "epoch": 0.23620718744727517,
113
+ "grad_norm": 0.9841225147247314,
114
+ "learning_rate": 0.00019876455686655583,
115
+ "loss": 0.9311,
116
+ "step": 700
117
+ },
118
+ {
119
+ "epoch": 0.2530791294077948,
120
+ "grad_norm": 0.96900475025177,
121
+ "learning_rate": 0.00019846280200740965,
122
+ "loss": 0.9292,
123
+ "step": 750
124
+ },
125
+ {
126
+ "epoch": 0.2699510713683145,
127
+ "grad_norm": 0.9467640519142151,
128
+ "learning_rate": 0.000198128389738679,
129
+ "loss": 0.9316,
130
+ "step": 800
131
+ },
132
+ {
133
+ "epoch": 0.28682301332883414,
134
+ "grad_norm": 0.9673274755477905,
135
+ "learning_rate": 0.00019776143097573705,
136
+ "loss": 0.8972,
137
+ "step": 850
138
+ },
139
+ {
140
+ "epoch": 0.3036949552893538,
141
+ "grad_norm": 0.8616816401481628,
142
+ "learning_rate": 0.00019736204742873604,
143
+ "loss": 0.8998,
144
+ "step": 900
145
+ },
146
+ {
147
+ "epoch": 0.3205668972498735,
148
+ "grad_norm": 0.9249860644340515,
149
+ "learning_rate": 0.00019693037156223942,
150
+ "loss": 0.8788,
151
+ "step": 950
152
+ },
153
+ {
154
+ "epoch": 0.33743883921039314,
155
+ "grad_norm": 0.8553484678268433,
156
+ "learning_rate": 0.00019646654655128672,
157
+ "loss": 0.8766,
158
+ "step": 1000
159
+ },
160
+ {
161
+ "epoch": 0.33743883921039314,
162
+ "eval_loss": 0.8600347638130188,
163
+ "eval_runtime": 296.4688,
164
+ "eval_samples_per_second": 8.419,
165
+ "eval_steps_per_second": 1.052,
166
+ "step": 1000
167
+ },
168
+ {
169
+ "epoch": 0.35431078117091275,
170
+ "grad_norm": 0.9525033235549927,
171
+ "learning_rate": 0.00019597072623390668,
172
+ "loss": 0.8831,
173
+ "step": 1050
174
+ },
175
+ {
176
+ "epoch": 0.3711827231314324,
177
+ "grad_norm": 0.9420183300971985,
178
+ "learning_rate": 0.00019544307506009313,
179
+ "loss": 0.8662,
180
+ "step": 1100
181
+ },
182
+ {
183
+ "epoch": 0.3880546650919521,
184
+ "grad_norm": 0.9950663447380066,
185
+ "learning_rate": 0.00019488376803726153,
186
+ "loss": 0.8687,
187
+ "step": 1150
188
+ },
189
+ {
190
+ "epoch": 0.40492660705247174,
191
+ "grad_norm": 0.8866828083992004,
192
+ "learning_rate": 0.00019429299067220387,
193
+ "loss": 0.8676,
194
+ "step": 1200
195
+ },
196
+ {
197
+ "epoch": 0.4217985490129914,
198
+ "grad_norm": 0.9560692310333252,
199
+ "learning_rate": 0.00019367093890956108,
200
+ "loss": 0.8552,
201
+ "step": 1250
202
+ },
203
+ {
204
+ "epoch": 0.4386704909735111,
205
+ "grad_norm": 0.9126181602478027,
206
+ "learning_rate": 0.00019301781906683362,
207
+ "loss": 0.8335,
208
+ "step": 1300
209
+ },
210
+ {
211
+ "epoch": 0.4555424329340307,
212
+ "grad_norm": 0.9437697529792786,
213
+ "learning_rate": 0.0001923338477659515,
214
+ "loss": 0.8424,
215
+ "step": 1350
216
+ },
217
+ {
218
+ "epoch": 0.47241437489455035,
219
+ "grad_norm": 0.9593287110328674,
220
+ "learning_rate": 0.00019161925186142692,
221
+ "loss": 0.8386,
222
+ "step": 1400
223
+ },
224
+ {
225
+ "epoch": 0.48928631685507,
226
+ "grad_norm": 0.9925290942192078,
227
+ "learning_rate": 0.00019087426836511277,
228
+ "loss": 0.8431,
229
+ "step": 1450
230
+ },
231
+ {
232
+ "epoch": 0.5061582588155896,
233
+ "grad_norm": 0.9618675112724304,
234
+ "learning_rate": 0.00019009914436759223,
235
+ "loss": 0.8299,
236
+ "step": 1500
237
+ },
238
+ {
239
+ "epoch": 0.5061582588155896,
240
+ "eval_loss": 0.8129043579101562,
241
+ "eval_runtime": 296.3926,
242
+ "eval_samples_per_second": 8.421,
243
+ "eval_steps_per_second": 1.053,
244
+ "step": 1500
245
+ },
246
+ {
247
+ "epoch": 0.5230302007761093,
248
+ "grad_norm": 0.951849639415741,
249
+ "learning_rate": 0.00018929413695622572,
250
+ "loss": 0.8211,
251
+ "step": 1550
252
+ },
253
+ {
254
+ "epoch": 0.539902142736629,
255
+ "grad_norm": 0.9019031524658203,
256
+ "learning_rate": 0.00018845951312988196,
257
+ "loss": 0.8234,
258
+ "step": 1600
259
+ },
260
+ {
261
+ "epoch": 0.5567740846971486,
262
+ "grad_norm": 0.9506643414497375,
263
+ "learning_rate": 0.00018759554971038196,
264
+ "loss": 0.8194,
265
+ "step": 1650
266
+ },
267
+ {
268
+ "epoch": 0.5736460266576683,
269
+ "grad_norm": 0.9535942077636719,
270
+ "learning_rate": 0.00018670253325068456,
271
+ "loss": 0.8017,
272
+ "step": 1700
273
+ },
274
+ {
275
+ "epoch": 0.590517968618188,
276
+ "grad_norm": 1.0262322425842285,
277
+ "learning_rate": 0.00018578075993984488,
278
+ "loss": 0.8162,
279
+ "step": 1750
280
+ },
281
+ {
282
+ "epoch": 0.6073899105787076,
283
+ "grad_norm": 0.8758232593536377,
284
+ "learning_rate": 0.00018483053550477649,
285
+ "loss": 0.7917,
286
+ "step": 1800
287
+ },
288
+ {
289
+ "epoch": 0.6242618525392273,
290
+ "grad_norm": 0.9430511593818665,
291
+ "learning_rate": 0.00018385217510885008,
292
+ "loss": 0.8057,
293
+ "step": 1850
294
+ },
295
+ {
296
+ "epoch": 0.641133794499747,
297
+ "grad_norm": 0.8898816108703613,
298
+ "learning_rate": 0.00018284600324736257,
299
+ "loss": 0.7983,
300
+ "step": 1900
301
+ },
302
+ {
303
+ "epoch": 0.6580057364602666,
304
+ "grad_norm": 0.8455677628517151,
305
+ "learning_rate": 0.00018181235363991087,
306
+ "loss": 0.7793,
307
+ "step": 1950
308
+ },
309
+ {
310
+ "epoch": 0.6748776784207863,
311
+ "grad_norm": 0.8179712295532227,
312
+ "learning_rate": 0.00018075156911970616,
313
+ "loss": 0.7874,
314
+ "step": 2000
315
+ },
316
+ {
317
+ "epoch": 0.6748776784207863,
318
+ "eval_loss": 0.774493932723999,
319
+ "eval_runtime": 296.6518,
320
+ "eval_samples_per_second": 8.414,
321
+ "eval_steps_per_second": 1.052,
322
+ "step": 2000
323
+ },
324
+ {
325
+ "epoch": 0.6917496203813059,
326
+ "grad_norm": 1.012557029724121,
327
+ "learning_rate": 0.00017966400151986562,
328
+ "loss": 0.7822,
329
+ "step": 2050
330
+ },
331
+ {
332
+ "epoch": 0.7086215623418255,
333
+ "grad_norm": 0.8059019446372986,
334
+ "learning_rate": 0.00017855001155671905,
335
+ "loss": 0.7862,
336
+ "step": 2100
337
+ },
338
+ {
339
+ "epoch": 0.7254935043023452,
340
+ "grad_norm": 0.8411868810653687,
341
+ "learning_rate": 0.00017740996871016903,
342
+ "loss": 0.7789,
343
+ "step": 2150
344
+ },
345
+ {
346
+ "epoch": 0.7423654462628648,
347
+ "grad_norm": 0.9554468989372253,
348
+ "learning_rate": 0.0001762442511011448,
349
+ "loss": 0.7709,
350
+ "step": 2200
351
+ },
352
+ {
353
+ "epoch": 0.7592373882233845,
354
+ "grad_norm": 0.947722852230072,
355
+ "learning_rate": 0.00017505324536618968,
356
+ "loss": 0.7572,
357
+ "step": 2250
358
+ },
359
+ {
360
+ "epoch": 0.7761093301839042,
361
+ "grad_norm": 1.0708439350128174,
362
+ "learning_rate": 0.0001738373465292245,
363
+ "loss": 0.775,
364
+ "step": 2300
365
+ },
366
+ {
367
+ "epoch": 0.7929812721444238,
368
+ "grad_norm": 0.9122579097747803,
369
+ "learning_rate": 0.00017259695787052895,
370
+ "loss": 0.7638,
371
+ "step": 2350
372
+ },
373
+ {
374
+ "epoch": 0.8098532141049435,
375
+ "grad_norm": 1.1625076532363892,
376
+ "learning_rate": 0.00017133249079298455,
377
+ "loss": 0.7654,
378
+ "step": 2400
379
+ },
380
+ {
381
+ "epoch": 0.8267251560654632,
382
+ "grad_norm": 0.971483051776886,
383
+ "learning_rate": 0.0001700443646856237,
384
+ "loss": 0.7503,
385
+ "step": 2450
386
+ },
387
+ {
388
+ "epoch": 0.8435970980259828,
389
+ "grad_norm": 0.9200385808944702,
390
+ "learning_rate": 0.0001687330067845297,
391
+ "loss": 0.7752,
392
+ "step": 2500
393
+ },
394
+ {
395
+ "epoch": 0.8435970980259828,
396
+ "eval_loss": 0.7435723543167114,
397
+ "eval_runtime": 296.1583,
398
+ "eval_samples_per_second": 8.428,
399
+ "eval_steps_per_second": 1.053,
400
+ "step": 2500
401
+ },
402
+ {
403
+ "epoch": 0.8604690399865025,
404
+ "grad_norm": 1.0115654468536377,
405
+ "learning_rate": 0.00016739885203113442,
406
+ "loss": 0.7602,
407
+ "step": 2550
408
+ },
409
+ {
410
+ "epoch": 0.8773409819470221,
411
+ "grad_norm": 1.0669232606887817,
412
+ "learning_rate": 0.00016604234292796007,
413
+ "loss": 0.7585,
414
+ "step": 2600
415
+ },
416
+ {
417
+ "epoch": 0.8942129239075418,
418
+ "grad_norm": 0.9092018604278564,
419
+ "learning_rate": 0.00016466392939185317,
420
+ "loss": 0.7534,
421
+ "step": 2650
422
+ },
423
+ {
424
+ "epoch": 0.9110848658680614,
425
+ "grad_norm": 1.232256293296814,
426
+ "learning_rate": 0.00016326406860475977,
427
+ "loss": 0.7418,
428
+ "step": 2700
429
+ },
430
+ {
431
+ "epoch": 0.927956807828581,
432
+ "grad_norm": 1.0810918807983398,
433
+ "learning_rate": 0.00016184322486209043,
434
+ "loss": 0.7439,
435
+ "step": 2750
436
+ },
437
+ {
438
+ "epoch": 0.9448287497891007,
439
+ "grad_norm": 0.9745123982429504,
440
+ "learning_rate": 0.00016040186941872631,
441
+ "loss": 0.7421,
442
+ "step": 2800
443
+ },
444
+ {
445
+ "epoch": 0.9617006917496204,
446
+ "grad_norm": 1.1044626235961914,
447
+ "learning_rate": 0.00015894048033271684,
448
+ "loss": 0.7388,
449
+ "step": 2850
450
+ },
451
+ {
452
+ "epoch": 0.97857263371014,
453
+ "grad_norm": 0.9833297729492188,
454
+ "learning_rate": 0.00015745954230672105,
455
+ "loss": 0.7364,
456
+ "step": 2900
457
+ },
458
+ {
459
+ "epoch": 0.9954445756706597,
460
+ "grad_norm": 1.0058021545410156,
461
+ "learning_rate": 0.00015595954652724485,
462
+ "loss": 0.742,
463
+ "step": 2950
464
+ },
465
+ {
466
+ "epoch": 1.012147798211574,
467
+ "grad_norm": 1.026547908782959,
468
+ "learning_rate": 0.00015444099050172807,
469
+ "loss": 0.6689,
470
+ "step": 3000
471
+ },
472
+ {
473
+ "epoch": 1.012147798211574,
474
+ "eval_loss": 0.7237228155136108,
475
+ "eval_runtime": 296.57,
476
+ "eval_samples_per_second": 8.416,
477
+ "eval_steps_per_second": 1.052,
478
+ "step": 3000
479
+ },
480
+ {
481
+ "epoch": 1.0290197401720937,
482
+ "grad_norm": 1.1261744499206543,
483
+ "learning_rate": 0.0001529043778935349,
484
+ "loss": 0.6521,
485
+ "step": 3050
486
+ },
487
+ {
488
+ "epoch": 1.0458916821326134,
489
+ "grad_norm": 1.1573985815048218,
490
+ "learning_rate": 0.00015138147018095146,
491
+ "loss": 0.6497,
492
+ "step": 3100
493
+ },
494
+ {
495
+ "epoch": 1.062763624093133,
496
+ "grad_norm": 1.098228931427002,
497
+ "learning_rate": 0.00014981061472467248,
498
+ "loss": 0.6544,
499
+ "step": 3150
500
+ },
501
+ {
502
+ "epoch": 1.0796355660536527,
503
+ "grad_norm": 1.1172102689743042,
504
+ "learning_rate": 0.00014822323845430378,
505
+ "loss": 0.6586,
506
+ "step": 3200
507
+ },
508
+ {
509
+ "epoch": 1.0965075080141724,
510
+ "grad_norm": 1.016932487487793,
511
+ "learning_rate": 0.0001466198678589963,
512
+ "loss": 0.6721,
513
+ "step": 3250
514
+ },
515
+ {
516
+ "epoch": 1.113379449974692,
517
+ "grad_norm": 0.972201406955719,
518
+ "learning_rate": 0.00014500103473277963,
519
+ "loss": 0.6687,
520
+ "step": 3300
521
+ },
522
+ {
523
+ "epoch": 1.1302513919352117,
524
+ "grad_norm": 1.0418280363082886,
525
+ "learning_rate": 0.0001433672759981806,
526
+ "loss": 0.6643,
527
+ "step": 3350
528
+ },
529
+ {
530
+ "epoch": 1.1471233338957314,
531
+ "grad_norm": 1.0220147371292114,
532
+ "learning_rate": 0.00014171913352814075,
533
+ "loss": 0.6538,
534
+ "step": 3400
535
+ },
536
+ {
537
+ "epoch": 1.163995275856251,
538
+ "grad_norm": 0.9467246532440186,
539
+ "learning_rate": 0.000140057153966292,
540
+ "loss": 0.6427,
541
+ "step": 3450
542
+ },
543
+ {
544
+ "epoch": 1.1808672178167707,
545
+ "grad_norm": 1.1602420806884766,
546
+ "learning_rate": 0.00013838188854564993,
547
+ "loss": 0.6496,
548
+ "step": 3500
549
+ },
550
+ {
551
+ "epoch": 1.1808672178167707,
552
+ "eval_loss": 0.6991020441055298,
553
+ "eval_runtime": 296.4391,
554
+ "eval_samples_per_second": 8.42,
555
+ "eval_steps_per_second": 1.052,
556
+ "step": 3500
557
+ },
558
+ {
559
+ "epoch": 1.1977391597772904,
560
+ "grad_norm": 1.0359673500061035,
561
+ "learning_rate": 0.00013669389290578491,
562
+ "loss": 0.6574,
563
+ "step": 3550
564
+ },
565
+ {
566
+ "epoch": 1.21461110173781,
567
+ "grad_norm": 1.0353167057037354,
568
+ "learning_rate": 0.0001349937269085317,
569
+ "loss": 0.6462,
570
+ "step": 3600
571
+ },
572
+ {
573
+ "epoch": 1.2314830436983297,
574
+ "grad_norm": 0.9980498552322388,
575
+ "learning_rate": 0.00013328195445229868,
576
+ "loss": 0.6515,
577
+ "step": 3650
578
+ },
579
+ {
580
+ "epoch": 1.2483549856588494,
581
+ "grad_norm": 1.051719069480896,
582
+ "learning_rate": 0.0001315591432850381,
583
+ "loss": 0.6546,
584
+ "step": 3700
585
+ },
586
+ {
587
+ "epoch": 1.265226927619369,
588
+ "grad_norm": 1.1057497262954712,
589
+ "learning_rate": 0.0001298258648159399,
590
+ "loss": 0.6313,
591
+ "step": 3750
592
+ },
593
+ {
594
+ "epoch": 1.2820988695798887,
595
+ "grad_norm": 1.1292750835418701,
596
+ "learning_rate": 0.0001280826939259106,
597
+ "loss": 0.6329,
598
+ "step": 3800
599
+ },
600
+ {
601
+ "epoch": 1.2989708115404084,
602
+ "grad_norm": 1.1297887563705444,
603
+ "learning_rate": 0.00012633020877690155,
604
+ "loss": 0.6384,
605
+ "step": 3850
606
+ },
607
+ {
608
+ "epoch": 1.3158427535009278,
609
+ "grad_norm": 0.9869160652160645,
610
+ "learning_rate": 0.00012456899062014806,
611
+ "loss": 0.6226,
612
+ "step": 3900
613
+ },
614
+ {
615
+ "epoch": 1.3327146954614477,
616
+ "grad_norm": 0.9497949481010437,
617
+ "learning_rate": 0.00012279962360338447,
618
+ "loss": 0.6225,
619
+ "step": 3950
620
+ },
621
+ {
622
+ "epoch": 1.3495866374219672,
623
+ "grad_norm": 1.053124189376831,
624
+ "learning_rate": 0.00012102269457709843,
625
+ "loss": 0.6196,
626
+ "step": 4000
627
+ },
628
+ {
629
+ "epoch": 1.3495866374219672,
630
+ "eval_loss": 0.6678062081336975,
631
+ "eval_runtime": 296.2326,
632
+ "eval_samples_per_second": 8.426,
633
+ "eval_steps_per_second": 1.053,
634
+ "step": 4000
635
+ },
636
+ {
637
+ "epoch": 1.366458579382487,
638
+ "grad_norm": 1.1627007722854614,
639
+ "learning_rate": 0.0001192387928998886,
640
+ "loss": 0.6527,
641
+ "step": 4050
642
+ },
643
+ {
644
+ "epoch": 1.3833305213430065,
645
+ "grad_norm": 1.329759955406189,
646
+ "learning_rate": 0.00011744851024299069,
647
+ "loss": 0.6297,
648
+ "step": 4100
649
+ },
650
+ {
651
+ "epoch": 1.4002024633035262,
652
+ "grad_norm": 1.1688311100006104,
653
+ "learning_rate": 0.00011565244039403622,
654
+ "loss": 0.63,
655
+ "step": 4150
656
+ },
657
+ {
658
+ "epoch": 1.4170744052640458,
659
+ "grad_norm": 0.9974623918533325,
660
+ "learning_rate": 0.00011385117906010953,
661
+ "loss": 0.6394,
662
+ "step": 4200
663
+ },
664
+ {
665
+ "epoch": 1.4339463472245655,
666
+ "grad_norm": 1.0659152269363403,
667
+ "learning_rate": 0.00011204532367016806,
668
+ "loss": 0.6181,
669
+ "step": 4250
670
+ },
671
+ {
672
+ "epoch": 1.4508182891850852,
673
+ "grad_norm": 1.0745680332183838,
674
+ "learning_rate": 0.00011027170545816326,
675
+ "loss": 0.6281,
676
+ "step": 4300
677
+ },
678
+ {
679
+ "epoch": 1.4676902311456048,
680
+ "grad_norm": 1.1593629121780396,
681
+ "learning_rate": 0.00010845852214547601,
682
+ "loss": 0.6296,
683
+ "step": 4350
684
+ },
685
+ {
686
+ "epoch": 1.4845621731061245,
687
+ "grad_norm": 1.3579237461090088,
688
+ "learning_rate": 0.00010664253337309687,
689
+ "loss": 0.6152,
690
+ "step": 4400
691
+ },
692
+ {
693
+ "epoch": 1.5014341150666441,
694
+ "grad_norm": 1.2364161014556885,
695
+ "learning_rate": 0.00010482434145467046,
696
+ "loss": 0.6067,
697
+ "step": 4450
698
+ },
699
+ {
700
+ "epoch": 1.5183060570271638,
701
+ "grad_norm": 1.0740337371826172,
702
+ "learning_rate": 0.00010300454943456457,
703
+ "loss": 0.6175,
704
+ "step": 4500
705
+ },
706
+ {
707
+ "epoch": 1.5183060570271638,
708
+ "eval_loss": 0.6415057182312012,
709
+ "eval_runtime": 296.7867,
710
+ "eval_samples_per_second": 8.41,
711
+ "eval_steps_per_second": 1.051,
712
+ "step": 4500
713
+ },
714
+ {
715
+ "epoch": 1.5351779989876835,
716
+ "grad_norm": 1.1068068742752075,
717
+ "learning_rate": 0.00010118376088785673,
718
+ "loss": 0.6221,
719
+ "step": 4550
720
+ },
721
+ {
722
+ "epoch": 1.5520499409482031,
723
+ "grad_norm": 1.1202526092529297,
724
+ "learning_rate": 9.936257972014506e-05,
725
+ "loss": 0.6198,
726
+ "step": 4600
727
+ },
728
+ {
729
+ "epoch": 1.5689218829087228,
730
+ "grad_norm": 1.1293288469314575,
731
+ "learning_rate": 9.754160996724927e-05,
732
+ "loss": 0.5997,
733
+ "step": 4650
734
+ },
735
+ {
736
+ "epoch": 1.5857938248692425,
737
+ "grad_norm": 1.0531474351882935,
738
+ "learning_rate": 9.572145559486855e-05,
739
+ "loss": 0.6041,
740
+ "step": 4700
741
+ },
742
+ {
743
+ "epoch": 1.6026657668297621,
744
+ "grad_norm": 0.895604133605957,
745
+ "learning_rate": 9.390272029826282e-05,
746
+ "loss": 0.6005,
747
+ "step": 4750
748
+ },
749
+ {
750
+ "epoch": 1.6195377087902818,
751
+ "grad_norm": 1.1192911863327026,
752
+ "learning_rate": 9.208600730202339e-05,
753
+ "loss": 0.5992,
754
+ "step": 4800
755
+ },
756
+ {
757
+ "epoch": 1.6364096507508015,
758
+ "grad_norm": 1.1751166582107544,
759
+ "learning_rate": 9.027191916000018e-05,
760
+ "loss": 0.586,
761
+ "step": 4850
762
+ },
763
+ {
764
+ "epoch": 1.6532815927113211,
765
+ "grad_norm": 0.9682310223579407,
766
+ "learning_rate": 8.846105755545086e-05,
767
+ "loss": 0.5969,
768
+ "step": 4900
769
+ },
770
+ {
771
+ "epoch": 1.6701535346718406,
772
+ "grad_norm": 1.1701383590698242,
773
+ "learning_rate": 8.665402310147924e-05,
774
+ "loss": 0.579,
775
+ "step": 4950
776
+ },
777
+ {
778
+ "epoch": 1.6870254766323605,
779
+ "grad_norm": 1.0403779745101929,
780
+ "learning_rate": 8.485141514182825e-05,
781
+ "loss": 0.5788,
782
+ "step": 5000
783
+ },
784
+ {
785
+ "epoch": 1.6870254766323605,
786
+ "eval_loss": 0.6112694144248962,
787
+ "eval_runtime": 296.234,
788
+ "eval_samples_per_second": 8.426,
789
+ "eval_steps_per_second": 1.053,
790
+ "step": 5000
791
+ },
792
+ {
793
+ "epoch": 1.70389741859288,
794
+ "grad_norm": 1.126141905784607,
795
+ "learning_rate": 8.305383155209414e-05,
796
+ "loss": 0.5862,
797
+ "step": 5050
798
+ },
799
+ {
800
+ "epoch": 1.7207693605533998,
801
+ "grad_norm": 1.0474798679351807,
802
+ "learning_rate": 8.126186854142752e-05,
803
+ "loss": 0.579,
804
+ "step": 5100
805
+ },
806
+ {
807
+ "epoch": 1.7376413025139192,
808
+ "grad_norm": 2.020526647567749,
809
+ "learning_rate": 7.947612045478724e-05,
810
+ "loss": 0.5636,
811
+ "step": 5150
812
+ },
813
+ {
814
+ "epoch": 1.7545132444744391,
815
+ "grad_norm": 1.0213319063186646,
816
+ "learning_rate": 7.76971795758122e-05,
817
+ "loss": 0.5695,
818
+ "step": 5200
819
+ },
820
+ {
821
+ "epoch": 1.7713851864349586,
822
+ "grad_norm": 1.034336805343628,
823
+ "learning_rate": 7.592563593037746e-05,
824
+ "loss": 0.5849,
825
+ "step": 5250
826
+ },
827
+ {
828
+ "epoch": 1.7882571283954785,
829
+ "grad_norm": 1.0699772834777832,
830
+ "learning_rate": 7.41972662287419e-05,
831
+ "loss": 0.5714,
832
+ "step": 5300
833
+ },
834
+ {
835
+ "epoch": 1.805129070355998,
836
+ "grad_norm": 1.1747502088546753,
837
+ "learning_rate": 7.244210001050232e-05,
838
+ "loss": 0.5604,
839
+ "step": 5350
840
+ },
841
+ {
842
+ "epoch": 1.8220010123165178,
843
+ "grad_norm": 1.082960605621338,
844
+ "learning_rate": 7.069607399149428e-05,
845
+ "loss": 0.5551,
846
+ "step": 5400
847
+ },
848
+ {
849
+ "epoch": 1.8388729542770372,
850
+ "grad_norm": 0.9143629670143127,
851
+ "learning_rate": 6.895976728063694e-05,
852
+ "loss": 0.5581,
853
+ "step": 5450
854
+ },
855
+ {
856
+ "epoch": 1.855744896237557,
857
+ "grad_norm": 1.0392253398895264,
858
+ "learning_rate": 6.723375576322166e-05,
859
+ "loss": 0.5506,
860
+ "step": 5500
861
+ },
862
+ {
863
+ "epoch": 1.855744896237557,
864
+ "eval_loss": 0.5858550667762756,
865
+ "eval_runtime": 296.794,
866
+ "eval_samples_per_second": 8.41,
867
+ "eval_steps_per_second": 1.051,
868
+ "step": 5500
869
+ },
870
+ {
871
+ "epoch": 1.8726168381980766,
872
+ "grad_norm": 1.1118271350860596,
873
+ "learning_rate": 6.551861190990665e-05,
874
+ "loss": 0.5508,
875
+ "step": 5550
876
+ },
877
+ {
878
+ "epoch": 1.8894887801585962,
879
+ "grad_norm": 1.0717848539352417,
880
+ "learning_rate": 6.381490458684407e-05,
881
+ "loss": 0.5489,
882
+ "step": 5600
883
+ },
884
+ {
885
+ "epoch": 1.906360722119116,
886
+ "grad_norm": 0.9712995290756226,
887
+ "learning_rate": 6.212319886700289e-05,
888
+ "loss": 0.547,
889
+ "step": 5650
890
+ },
891
+ {
892
+ "epoch": 1.9232326640796356,
893
+ "grad_norm": 1.0615884065628052,
894
+ "learning_rate": 6.044405584274961e-05,
895
+ "loss": 0.54,
896
+ "step": 5700
897
+ },
898
+ {
899
+ "epoch": 1.9401046060401552,
900
+ "grad_norm": 1.0537705421447754,
901
+ "learning_rate": 5.8778032439749284e-05,
902
+ "loss": 0.5279,
903
+ "step": 5750
904
+ },
905
+ {
906
+ "epoch": 1.9569765480006749,
907
+ "grad_norm": 1.081616759300232,
908
+ "learning_rate": 5.7125681232248684e-05,
909
+ "loss": 0.5473,
910
+ "step": 5800
911
+ },
912
+ {
913
+ "epoch": 1.9738484899611946,
914
+ "grad_norm": 1.1939841508865356,
915
+ "learning_rate": 5.548755025980237e-05,
916
+ "loss": 0.5422,
917
+ "step": 5850
918
+ },
919
+ {
920
+ "epoch": 1.9907204319217142,
921
+ "grad_norm": 1.1471023559570312,
922
+ "learning_rate": 5.3864182845503296e-05,
923
+ "loss": 0.5484,
924
+ "step": 5900
925
+ },
926
+ {
927
+ "epoch": 2.0074236544626287,
928
+ "grad_norm": 1.2721205949783325,
929
+ "learning_rate": 5.225611741577716e-05,
930
+ "loss": 0.4849,
931
+ "step": 5950
932
+ },
933
+ {
934
+ "epoch": 2.024295596423148,
935
+ "grad_norm": 1.3426976203918457,
936
+ "learning_rate": 5.066388732180136e-05,
937
+ "loss": 0.4106,
938
+ "step": 6000
939
+ },
940
+ {
941
+ "epoch": 2.024295596423148,
942
+ "eval_loss": 0.563025176525116,
943
+ "eval_runtime": 296.3567,
944
+ "eval_samples_per_second": 8.422,
945
+ "eval_steps_per_second": 1.053,
946
+ "step": 6000
947
+ },
948
+ {
949
+ "epoch": 2.041167538383668,
950
+ "grad_norm": 1.2091578245162964,
951
+ "learning_rate": 4.908802066260697e-05,
952
+ "loss": 0.3968,
953
+ "step": 6050
954
+ },
955
+ {
956
+ "epoch": 2.0580394803441875,
957
+ "grad_norm": 1.1719856262207031,
958
+ "learning_rate": 4.7529040109922584e-05,
959
+ "loss": 0.4055,
960
+ "step": 6100
961
+ },
962
+ {
963
+ "epoch": 2.0749114223047074,
964
+ "grad_norm": 1.3152216672897339,
965
+ "learning_rate": 4.598746273481881e-05,
966
+ "loss": 0.395,
967
+ "step": 6150
968
+ },
969
+ {
970
+ "epoch": 2.091783364265227,
971
+ "grad_norm": 1.2058014869689941,
972
+ "learning_rate": 4.446379983620979e-05,
973
+ "loss": 0.3895,
974
+ "step": 6200
975
+ },
976
+ {
977
+ "epoch": 2.1086553062257467,
978
+ "grad_norm": 1.1317533254623413,
979
+ "learning_rate": 4.2988477878823355e-05,
980
+ "loss": 0.4027,
981
+ "step": 6250
982
+ },
983
+ {
984
+ "epoch": 2.125527248186266,
985
+ "grad_norm": 1.241541862487793,
986
+ "learning_rate": 4.1501770661428595e-05,
987
+ "loss": 0.411,
988
+ "step": 6300
989
+ },
990
+ {
991
+ "epoch": 2.142399190146786,
992
+ "grad_norm": 1.170419454574585,
993
+ "learning_rate": 4.003446570150093e-05,
994
+ "loss": 0.3995,
995
+ "step": 6350
996
+ },
997
+ {
998
+ "epoch": 2.1592711321073055,
999
+ "grad_norm": 1.35177743434906,
1000
+ "learning_rate": 3.858704966383232e-05,
1001
+ "loss": 0.4068,
1002
+ "step": 6400
1003
+ },
1004
+ {
1005
+ "epoch": 2.1761430740678254,
1006
+ "grad_norm": 1.2839558124542236,
1007
+ "learning_rate": 3.71600026166051e-05,
1008
+ "loss": 0.4122,
1009
+ "step": 6450
1010
+ },
1011
+ {
1012
+ "epoch": 2.193015016028345,
1013
+ "grad_norm": 1.3839994668960571,
1014
+ "learning_rate": 3.575379787216629e-05,
1015
+ "loss": 0.4044,
1016
+ "step": 6500
1017
+ },
1018
+ {
1019
+ "epoch": 2.193015016028345,
1020
+ "eval_loss": 0.5397977232933044,
1021
+ "eval_runtime": 296.486,
1022
+ "eval_samples_per_second": 8.419,
1023
+ "eval_steps_per_second": 1.052,
1024
+ "step": 6500
1025
+ },
1026
+ {
1027
+ "epoch": 2.2098869579888647,
1028
+ "grad_norm": 1.2803045511245728,
1029
+ "learning_rate": 3.436890183004309e-05,
1030
+ "loss": 0.3822,
1031
+ "step": 6550
1032
+ },
1033
+ {
1034
+ "epoch": 2.226758899949384,
1035
+ "grad_norm": 1.3744126558303833,
1036
+ "learning_rate": 3.300577382225076e-05,
1037
+ "loss": 0.3915,
1038
+ "step": 6600
1039
+ },
1040
+ {
1041
+ "epoch": 2.243630841909904,
1042
+ "grad_norm": 1.1824839115142822,
1043
+ "learning_rate": 3.1664865960945e-05,
1044
+ "loss": 0.3884,
1045
+ "step": 6650
1046
+ },
1047
+ {
1048
+ "epoch": 2.2605027838704235,
1049
+ "grad_norm": 1.3948158025741577,
1050
+ "learning_rate": 3.03466229884686e-05,
1051
+ "loss": 0.382,
1052
+ "step": 6700
1053
+ },
1054
+ {
1055
+ "epoch": 2.277374725830943,
1056
+ "grad_norm": 1.2800052165985107,
1057
+ "learning_rate": 2.9051482129842577e-05,
1058
+ "loss": 0.3914,
1059
+ "step": 6750
1060
+ },
1061
+ {
1062
+ "epoch": 2.294246667791463,
1063
+ "grad_norm": 1.3162052631378174,
1064
+ "learning_rate": 2.777987294775086e-05,
1065
+ "loss": 0.3865,
1066
+ "step": 6800
1067
+ },
1068
+ {
1069
+ "epoch": 2.3111186097519827,
1070
+ "grad_norm": 1.1745421886444092,
1071
+ "learning_rate": 2.6532217200065858e-05,
1072
+ "loss": 0.3752,
1073
+ "step": 6850
1074
+ },
1075
+ {
1076
+ "epoch": 2.327990551712502,
1077
+ "grad_norm": 1.5241754055023193,
1078
+ "learning_rate": 2.5308928699963153e-05,
1079
+ "loss": 0.368,
1080
+ "step": 6900
1081
+ },
1082
+ {
1083
+ "epoch": 2.3448624936730216,
1084
+ "grad_norm": 1.3691622018814087,
1085
+ "learning_rate": 2.4110413178670878e-05,
1086
+ "loss": 0.3715,
1087
+ "step": 6950
1088
+ },
1089
+ {
1090
+ "epoch": 2.3617344356335415,
1091
+ "grad_norm": 1.206244707107544,
1092
+ "learning_rate": 2.2937068150899967e-05,
1093
+ "loss": 0.3781,
1094
+ "step": 7000
1095
+ },
1096
+ {
1097
+ "epoch": 2.3617344356335415,
1098
+ "eval_loss": 0.5177870392799377,
1099
+ "eval_runtime": 296.651,
1100
+ "eval_samples_per_second": 8.414,
1101
+ "eval_steps_per_second": 1.052,
1102
+ "step": 7000
1103
+ },
1104
+ {
1105
+ "epoch": 2.378606377594061,
1106
+ "grad_norm": 1.3172132968902588,
1107
+ "learning_rate": 2.1789282782999254e-05,
1108
+ "loss": 0.3787,
1109
+ "step": 7050
1110
+ },
1111
+ {
1112
+ "epoch": 2.395478319554581,
1113
+ "grad_norm": 1.4043761491775513,
1114
+ "learning_rate": 2.066743776387974e-05,
1115
+ "loss": 0.3798,
1116
+ "step": 7100
1117
+ },
1118
+ {
1119
+ "epoch": 2.4123502615151002,
1120
+ "grad_norm": 1.2046414613723755,
1121
+ "learning_rate": 1.957190517875064e-05,
1122
+ "loss": 0.3755,
1123
+ "step": 7150
1124
+ },
1125
+ {
1126
+ "epoch": 2.42922220347562,
1127
+ "grad_norm": 1.214189052581787,
1128
+ "learning_rate": 1.850304838570879e-05,
1129
+ "loss": 0.3722,
1130
+ "step": 7200
1131
+ },
1132
+ {
1133
+ "epoch": 2.4460941454361396,
1134
+ "grad_norm": 1.3583543300628662,
1135
+ "learning_rate": 1.7461221895222724e-05,
1136
+ "loss": 0.3667,
1137
+ "step": 7250
1138
+ },
1139
+ {
1140
+ "epoch": 2.4629660873966595,
1141
+ "grad_norm": 1.5781126022338867,
1142
+ "learning_rate": 1.644677125255143e-05,
1143
+ "loss": 0.3672,
1144
+ "step": 7300
1145
+ },
1146
+ {
1147
+ "epoch": 2.479838029357179,
1148
+ "grad_norm": 1.2086187601089478,
1149
+ "learning_rate": 1.546003292313629e-05,
1150
+ "loss": 0.358,
1151
+ "step": 7350
1152
+ },
1153
+ {
1154
+ "epoch": 2.496709971317699,
1155
+ "grad_norm": 1.1627147197723389,
1156
+ "learning_rate": 1.4501334181004889e-05,
1157
+ "loss": 0.3826,
1158
+ "step": 7400
1159
+ },
1160
+ {
1161
+ "epoch": 2.5135819132782182,
1162
+ "grad_norm": 1.4055496454238892,
1163
+ "learning_rate": 1.3570993000223043e-05,
1164
+ "loss": 0.356,
1165
+ "step": 7450
1166
+ },
1167
+ {
1168
+ "epoch": 2.530453855238738,
1169
+ "grad_norm": 1.2799688577651978,
1170
+ "learning_rate": 1.2669317949431659e-05,
1171
+ "loss": 0.3576,
1172
+ "step": 7500
1173
+ },
1174
+ {
1175
+ "epoch": 2.530453855238738,
1176
+ "eval_loss": 0.49860402941703796,
1177
+ "eval_runtime": 296.3625,
1178
+ "eval_samples_per_second": 8.422,
1179
+ "eval_steps_per_second": 1.053,
1180
+ "step": 7500
1181
+ },
1182
+ {
1183
+ "epoch": 2.5473257971992576,
1184
+ "grad_norm": 1.321845531463623,
1185
+ "learning_rate": 1.1796608089502948e-05,
1186
+ "loss": 0.3661,
1187
+ "step": 7550
1188
+ },
1189
+ {
1190
+ "epoch": 2.5641977391597774,
1191
+ "grad_norm": 1.3550702333450317,
1192
+ "learning_rate": 1.0953152874350059e-05,
1193
+ "loss": 0.365,
1194
+ "step": 7600
1195
+ },
1196
+ {
1197
+ "epoch": 2.581069681120297,
1198
+ "grad_norm": 1.2584900856018066,
1199
+ "learning_rate": 1.0139232054923287e-05,
1200
+ "loss": 0.3535,
1201
+ "step": 7650
1202
+ },
1203
+ {
1204
+ "epoch": 2.5979416230808168,
1205
+ "grad_norm": 1.376833438873291,
1206
+ "learning_rate": 9.355115586424224e-06,
1207
+ "loss": 0.3502,
1208
+ "step": 7700
1209
+ },
1210
+ {
1211
+ "epoch": 2.614813565041336,
1212
+ "grad_norm": 1.1930843591690063,
1213
+ "learning_rate": 8.601063538769182e-06,
1214
+ "loss": 0.3705,
1215
+ "step": 7750
1216
+ },
1217
+ {
1218
+ "epoch": 2.6316855070018557,
1219
+ "grad_norm": 1.4217926263809204,
1220
+ "learning_rate": 7.877326010330977e-06,
1221
+ "loss": 0.3459,
1222
+ "step": 7800
1223
+ },
1224
+ {
1225
+ "epoch": 2.6485574489623755,
1226
+ "grad_norm": 1.2762850522994995,
1227
+ "learning_rate": 7.1841430449882895e-06,
1228
+ "loss": 0.3436,
1229
+ "step": 7850
1230
+ },
1231
+ {
1232
+ "epoch": 2.6654293909228954,
1233
+ "grad_norm": 1.2758060693740845,
1234
+ "learning_rate": 6.521744552509635e-06,
1235
+ "loss": 0.3575,
1236
+ "step": 7900
1237
+ },
1238
+ {
1239
+ "epoch": 2.682301332883415,
1240
+ "grad_norm": 1.4101016521453857,
1241
+ "learning_rate": 5.890350232298591e-06,
1242
+ "loss": 0.3513,
1243
+ "step": 7950
1244
+ },
1245
+ {
1246
+ "epoch": 2.6991732748439343,
1247
+ "grad_norm": 1.1934149265289307,
1248
+ "learning_rate": 5.290169500525577e-06,
1249
+ "loss": 0.3472,
1250
+ "step": 8000
1251
+ },
1252
+ {
1253
+ "epoch": 2.6991732748439343,
1254
+ "eval_loss": 0.4890361726284027,
1255
+ "eval_runtime": 296.7147,
1256
+ "eval_samples_per_second": 8.412,
1257
+ "eval_steps_per_second": 1.052,
1258
+ "step": 8000
1259
+ },
1260
+ {
1261
+ "epoch": 2.716045216804454,
1262
+ "grad_norm": 1.1966798305511475,
1263
+ "learning_rate": 4.721401420670224e-06,
1264
+ "loss": 0.3476,
1265
+ "step": 8050
1266
+ },
1267
+ {
1268
+ "epoch": 2.732917158764974,
1269
+ "grad_norm": 1.326464295387268,
1270
+ "learning_rate": 4.184234637497486e-06,
1271
+ "loss": 0.3607,
1272
+ "step": 8100
1273
+ },
1274
+ {
1275
+ "epoch": 2.7497891007254935,
1276
+ "grad_norm": 1.1875063180923462,
1277
+ "learning_rate": 3.6788473144893976e-06,
1278
+ "loss": 0.3421,
1279
+ "step": 8150
1280
+ },
1281
+ {
1282
+ "epoch": 2.766661042686013,
1283
+ "grad_norm": 1.318291425704956,
1284
+ "learning_rate": 3.20540707475302e-06,
1285
+ "loss": 0.3567,
1286
+ "step": 8200
1287
+ },
1288
+ {
1289
+ "epoch": 2.783532984646533,
1290
+ "grad_norm": 1.3796924352645874,
1291
+ "learning_rate": 2.7640709454245904e-06,
1292
+ "loss": 0.3543,
1293
+ "step": 8250
1294
+ },
1295
+ {
1296
+ "epoch": 2.8004049266070523,
1297
+ "grad_norm": 1.188733696937561,
1298
+ "learning_rate": 2.3549853055878314e-06,
1299
+ "loss": 0.3461,
1300
+ "step": 8300
1301
+ },
1302
+ {
1303
+ "epoch": 2.817276868567572,
1304
+ "grad_norm": 1.4971429109573364,
1305
+ "learning_rate": 1.978285837724092e-06,
1306
+ "loss": 0.345,
1307
+ "step": 8350
1308
+ },
1309
+ {
1310
+ "epoch": 2.8341488105280916,
1311
+ "grad_norm": 1.218836784362793,
1312
+ "learning_rate": 1.6340974827101286e-06,
1313
+ "loss": 0.3628,
1314
+ "step": 8400
1315
+ },
1316
+ {
1317
+ "epoch": 2.8510207524886115,
1318
+ "grad_norm": 1.3097290992736816,
1319
+ "learning_rate": 1.3225343983787054e-06,
1320
+ "loss": 0.3515,
1321
+ "step": 8450
1322
+ },
1323
+ {
1324
+ "epoch": 2.867892694449131,
1325
+ "grad_norm": 1.602677822113037,
1326
+ "learning_rate": 1.0436999216555276e-06,
1327
+ "loss": 0.3506,
1328
+ "step": 8500
1329
+ },
1330
+ {
1331
+ "epoch": 2.867892694449131,
1332
+ "eval_loss": 0.48525503277778625,
1333
+ "eval_runtime": 296.1461,
1334
+ "eval_samples_per_second": 8.428,
1335
+ "eval_steps_per_second": 1.054,
1336
+ "step": 8500
1337
+ },
1338
+ {
1339
+ "epoch": 2.884764636409651,
1340
+ "grad_norm": 1.332733392715454,
1341
+ "learning_rate": 7.976865342852469e-07,
1342
+ "loss": 0.3562,
1343
+ "step": 8550
1344
+ },
1345
+ {
1346
+ "epoch": 2.9016365783701703,
1347
+ "grad_norm": 1.2944486141204834,
1348
+ "learning_rate": 5.845758321577855e-07,
1349
+ "loss": 0.3544,
1350
+ "step": 8600
1351
+ },
1352
+ {
1353
+ "epoch": 2.91850852033069,
1354
+ "grad_norm": 1.2164134979248047,
1355
+ "learning_rate": 4.0443849824522985e-07,
1356
+ "loss": 0.3432,
1357
+ "step": 8650
1358
+ },
1359
+ {
1360
+ "epoch": 2.9353804622912096,
1361
+ "grad_norm": 1.24102783203125,
1362
+ "learning_rate": 2.5733427915823894e-07,
1363
+ "loss": 0.3481,
1364
+ "step": 8700
1365
+ },
1366
+ {
1367
+ "epoch": 2.9522524042517295,
1368
+ "grad_norm": 1.3387184143066406,
1369
+ "learning_rate": 1.433119653297177e-07,
1370
+ "loss": 0.3533,
1371
+ "step": 8750
1372
+ },
1373
+ {
1374
+ "epoch": 2.969124346212249,
1375
+ "grad_norm": 1.2963091135025024,
1376
+ "learning_rate": 6.240937483235066e-08,
1377
+ "loss": 0.3456,
1378
+ "step": 8800
1379
+ },
1380
+ {
1381
+ "epoch": 2.9859962881727684,
1382
+ "grad_norm": 1.0874204635620117,
1383
+ "learning_rate": 1.4653340835435458e-08,
1384
+ "loss": 0.3543,
1385
+ "step": 8850
1386
+ }
1387
+ ],
1388
+ "logging_steps": 50,
1389
+ "max_steps": 8892,
1390
+ "num_input_tokens_seen": 0,
1391
+ "num_train_epochs": 3,
1392
+ "save_steps": 500,
1393
+ "stateful_callbacks": {
1394
+ "TrainerControl": {
1395
+ "args": {
1396
+ "should_epoch_stop": false,
1397
+ "should_evaluate": false,
1398
+ "should_log": false,
1399
+ "should_save": true,
1400
+ "should_training_stop": true
1401
+ },
1402
+ "attributes": {}
1403
+ }
1404
+ },
1405
+ "total_flos": 6.169366398084317e+18,
1406
+ "train_batch_size": 8,
1407
+ "trial_name": null,
1408
+ "trial_params": null
1409
+ }
checkpoint-8892/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89962928316ba9167e9dc9efd4961d21c08e95c6bd6d9ad576dbd0f3c12dcdaf
3
+ size 5688
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89962928316ba9167e9dc9efd4961d21c08e95c6bd6d9ad576dbd0f3c12dcdaf
3
+ size 5688
training_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"base_model": "mistralai/Mistral-7B-Instruct-v0.2"}