SamChen888 committed on
Commit
ddc5e5d
·
verified ·
1 Parent(s): 86e89e7

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +59 -3
  2. adapter_config.json +34 -0
  3. adapter_model.safetensors +3 -0
  4. all_results.json +9 -0
  5. checkpoint-1000/README.md +202 -0
  6. checkpoint-1000/adapter_config.json +34 -0
  7. checkpoint-1000/adapter_model.safetensors +3 -0
  8. checkpoint-1000/optimizer.pt +3 -0
  9. checkpoint-1000/rng_state.pth +3 -0
  10. checkpoint-1000/scheduler.pt +3 -0
  11. checkpoint-1000/special_tokens_map.json +24 -0
  12. checkpoint-1000/tokenizer.json +0 -0
  13. checkpoint-1000/tokenizer.model +3 -0
  14. checkpoint-1000/tokenizer_config.json +0 -0
  15. checkpoint-1000/trainer_state.json +833 -0
  16. checkpoint-1000/training_args.bin +3 -0
  17. checkpoint-2000/README.md +202 -0
  18. checkpoint-2000/adapter_config.json +34 -0
  19. checkpoint-2000/adapter_model.safetensors +3 -0
  20. checkpoint-2000/optimizer.pt +3 -0
  21. checkpoint-2000/rng_state.pth +3 -0
  22. checkpoint-2000/scheduler.pt +3 -0
  23. checkpoint-2000/special_tokens_map.json +24 -0
  24. checkpoint-2000/tokenizer.json +0 -0
  25. checkpoint-2000/tokenizer.model +3 -0
  26. checkpoint-2000/tokenizer_config.json +0 -0
  27. checkpoint-2000/trainer_state.json +1633 -0
  28. checkpoint-2000/training_args.bin +3 -0
  29. checkpoint-3000/README.md +202 -0
  30. checkpoint-3000/adapter_config.json +34 -0
  31. checkpoint-3000/adapter_model.safetensors +3 -0
  32. checkpoint-3000/optimizer.pt +3 -0
  33. checkpoint-3000/rng_state.pth +3 -0
  34. checkpoint-3000/scheduler.pt +3 -0
  35. checkpoint-3000/special_tokens_map.json +24 -0
  36. checkpoint-3000/tokenizer.json +0 -0
  37. checkpoint-3000/tokenizer.model +3 -0
  38. checkpoint-3000/tokenizer_config.json +0 -0
  39. checkpoint-3000/trainer_state.json +2433 -0
  40. checkpoint-3000/training_args.bin +3 -0
  41. checkpoint-3096/README.md +202 -0
  42. checkpoint-3096/adapter_config.json +34 -0
  43. checkpoint-3096/adapter_model.safetensors +3 -0
  44. checkpoint-3096/optimizer.pt +3 -0
  45. checkpoint-3096/rng_state.pth +3 -0
  46. checkpoint-3096/scheduler.pt +3 -0
  47. checkpoint-3096/special_tokens_map.json +24 -0
  48. checkpoint-3096/tokenizer.json +0 -0
  49. checkpoint-3096/tokenizer.model +3 -0
  50. checkpoint-3096/tokenizer_config.json +0 -0
README.md CHANGED
@@ -1,3 +1,59 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - unsloth
9
+ - generated_from_trainer
10
+ model-index:
11
+ - name: cdb_polyhope_trained
12
+ results: []
13
+ ---
14
+
15
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
16
+ should probably proofread and complete it, then remove this comment. -->
17
+
18
+ # cdb_polyhope_trained
19
+
20
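+ This model is a LoRA fine-tuned version of [unsloth/mistral-7b-instruct-v0.3-bnb-4bit](https://huggingface.co/unsloth/mistral-7b-instruct-v0.3-bnb-4bit) (upstream model: [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)) on the train_cdb_polyhope dataset.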
21
+
22
+ ## Model description
23
+
24
+ More information needed
25
+
26
+ ## Intended uses & limitations
27
+
28
+ More information needed
29
+
30
+ ## Training and evaluation data
31
+
32
+ More information needed
33
+
34
+ ## Training procedure
35
+
36
+ ### Training hyperparameters
37
+
38
+ The following hyperparameters were used during training:
39
+ - learning_rate: 3e-05
40
+ - train_batch_size: 4
41
+ - eval_batch_size: 8
42
+ - seed: 42
43
+ - gradient_accumulation_steps: 2
44
+ - total_train_batch_size: 8
45
+ - optimizer: adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
46
+ - lr_scheduler_type: cosine
47
+ - num_epochs: 3.0
48
+
49
+ ### Training results
50
+
51
+
52
+
53
+ ### Framework versions
54
+
55
+ - PEFT 0.12.0
56
+ - Transformers 4.49.0
57
+ - Pytorch 2.6.0+cu124
58
+ - Datasets 3.2.0
59
+ - Tokenizers 0.21.0
adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "up_proj",
24
+ "gate_proj",
25
+ "q_proj",
26
+ "down_proj",
27
+ "v_proj",
28
+ "k_proj",
29
+ "o_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e945a3ed8fb6a8e001cf93765506fa4f4e8a5f26f7e25358e3647e1e889d3d2
3
+ size 83945296
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9975786924939465,
3
+ "num_input_tokens_seen": 7706072,
4
+ "total_flos": 3.298866475009966e+17,
5
+ "train_loss": 0.08281170262038245,
6
+ "train_runtime": 2763.8125,
7
+ "train_samples_per_second": 8.963,
8
+ "train_steps_per_second": 1.12
9
+ }
checkpoint-1000/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
checkpoint-1000/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "up_proj",
24
+ "gate_proj",
25
+ "q_proj",
26
+ "down_proj",
27
+ "v_proj",
28
+ "k_proj",
29
+ "o_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
checkpoint-1000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22113c08ccf674cc5c04de1ffd8d20cdc39eb799b723881de471618a23755ff3
3
+ size 83945296
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92820e3a359bfbd9c00a7c84c02e9fe730ee77af64ac4949dada1f627457b692
3
+ size 168149074
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
3
+ size 14244
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2f428295c7a74e082df84f05c5059fc45b17341d5982cd437d20453cec60a6d
3
+ size 1064
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-1000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
3
+ size 587404
checkpoint-1000/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9685230024213075,
5
+ "eval_steps": 500,
6
+ "global_step": 1000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009685230024213076,
13
+ "grad_norm": 6.778852939605713,
14
+ "learning_rate": 2.9999227754514262e-05,
15
+ "loss": 0.8519,
16
+ "num_input_tokens_seen": 25568,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.01937046004842615,
21
+ "grad_norm": 3.0029561519622803,
22
+ "learning_rate": 2.9996911097572118e-05,
23
+ "loss": 0.189,
24
+ "num_input_tokens_seen": 51072,
25
+ "step": 20
26
+ },
27
+ {
28
+ "epoch": 0.029055690072639227,
29
+ "grad_norm": 5.477710247039795,
30
+ "learning_rate": 2.9993050267710624e-05,
31
+ "loss": 0.1648,
32
+ "num_input_tokens_seen": 76416,
33
+ "step": 30
34
+ },
35
+ {
36
+ "epoch": 0.0387409200968523,
37
+ "grad_norm": 4.35634183883667,
38
+ "learning_rate": 2.9987645662464235e-05,
39
+ "loss": 0.1905,
40
+ "num_input_tokens_seen": 101344,
41
+ "step": 40
42
+ },
43
+ {
44
+ "epoch": 0.048426150121065374,
45
+ "grad_norm": 4.523565292358398,
46
+ "learning_rate": 2.9980697838323884e-05,
47
+ "loss": 0.1794,
48
+ "num_input_tokens_seen": 126656,
49
+ "step": 50
50
+ },
51
+ {
52
+ "epoch": 0.05811138014527845,
53
+ "grad_norm": 1.9348187446594238,
54
+ "learning_rate": 2.9972207510679677e-05,
55
+ "loss": 0.1528,
56
+ "num_input_tokens_seen": 151200,
57
+ "step": 60
58
+ },
59
+ {
60
+ "epoch": 0.06779661016949153,
61
+ "grad_norm": 2.981433629989624,
62
+ "learning_rate": 2.996217555374725e-05,
63
+ "loss": 0.1742,
64
+ "num_input_tokens_seen": 175968,
65
+ "step": 70
66
+ },
67
+ {
68
+ "epoch": 0.0774818401937046,
69
+ "grad_norm": 3.6294591426849365,
70
+ "learning_rate": 2.9950603000477722e-05,
71
+ "loss": 0.1565,
72
+ "num_input_tokens_seen": 201280,
73
+ "step": 80
74
+ },
75
+ {
76
+ "epoch": 0.08716707021791767,
77
+ "grad_norm": 2.5459301471710205,
78
+ "learning_rate": 2.993749104245137e-05,
79
+ "loss": 0.1499,
80
+ "num_input_tokens_seen": 226432,
81
+ "step": 90
82
+ },
83
+ {
84
+ "epoch": 0.09685230024213075,
85
+ "grad_norm": 2.2721059322357178,
86
+ "learning_rate": 2.992284102975491e-05,
87
+ "loss": 0.1441,
88
+ "num_input_tokens_seen": 251744,
89
+ "step": 100
90
+ },
91
+ {
92
+ "epoch": 0.10653753026634383,
93
+ "grad_norm": 2.0033624172210693,
94
+ "learning_rate": 2.9906654470842492e-05,
95
+ "loss": 0.1245,
96
+ "num_input_tokens_seen": 276480,
97
+ "step": 110
98
+ },
99
+ {
100
+ "epoch": 0.1162227602905569,
101
+ "grad_norm": 8.585118293762207,
102
+ "learning_rate": 2.9888933032380397e-05,
103
+ "loss": 0.1333,
104
+ "num_input_tokens_seen": 301664,
105
+ "step": 120
106
+ },
107
+ {
108
+ "epoch": 0.12590799031476999,
109
+ "grad_norm": 1.423967719078064,
110
+ "learning_rate": 2.9869678539075403e-05,
111
+ "loss": 0.1728,
112
+ "num_input_tokens_seen": 326784,
113
+ "step": 130
114
+ },
115
+ {
116
+ "epoch": 0.13559322033898305,
117
+ "grad_norm": 2.6306211948394775,
118
+ "learning_rate": 2.9848892973486912e-05,
119
+ "loss": 0.1281,
120
+ "num_input_tokens_seen": 351328,
121
+ "step": 140
122
+ },
123
+ {
124
+ "epoch": 0.14527845036319612,
125
+ "grad_norm": 2.5618090629577637,
126
+ "learning_rate": 2.9826578475822825e-05,
127
+ "loss": 0.1136,
128
+ "num_input_tokens_seen": 376000,
129
+ "step": 150
130
+ },
131
+ {
132
+ "epoch": 0.1549636803874092,
133
+ "grad_norm": 2.694077730178833,
134
+ "learning_rate": 2.980273734371914e-05,
135
+ "loss": 0.1277,
136
+ "num_input_tokens_seen": 400384,
137
+ "step": 160
138
+ },
139
+ {
140
+ "epoch": 0.16464891041162227,
141
+ "grad_norm": 2.632338047027588,
142
+ "learning_rate": 2.9777372032003423e-05,
143
+ "loss": 0.1028,
144
+ "num_input_tokens_seen": 426432,
145
+ "step": 170
146
+ },
147
+ {
148
+ "epoch": 0.17433414043583534,
149
+ "grad_norm": 2.3446829319000244,
150
+ "learning_rate": 2.975048515244199e-05,
151
+ "loss": 0.1245,
152
+ "num_input_tokens_seen": 451712,
153
+ "step": 180
154
+ },
155
+ {
156
+ "epoch": 0.18401937046004843,
157
+ "grad_norm": 1.8457319736480713,
158
+ "learning_rate": 2.9722079473471035e-05,
159
+ "loss": 0.142,
160
+ "num_input_tokens_seen": 476960,
161
+ "step": 190
162
+ },
163
+ {
164
+ "epoch": 0.1937046004842615,
165
+ "grad_norm": 1.8676010370254517,
166
+ "learning_rate": 2.9692157919911536e-05,
167
+ "loss": 0.1342,
168
+ "num_input_tokens_seen": 501440,
169
+ "step": 200
170
+ },
171
+ {
172
+ "epoch": 0.2033898305084746,
173
+ "grad_norm": 4.593673229217529,
174
+ "learning_rate": 2.966072357266811e-05,
175
+ "loss": 0.1314,
176
+ "num_input_tokens_seen": 526656,
177
+ "step": 210
178
+ },
179
+ {
180
+ "epoch": 0.21307506053268765,
181
+ "grad_norm": 3.9568676948547363,
182
+ "learning_rate": 2.9627779668411795e-05,
183
+ "loss": 0.171,
184
+ "num_input_tokens_seen": 552544,
185
+ "step": 220
186
+ },
187
+ {
188
+ "epoch": 0.22276029055690072,
189
+ "grad_norm": 2.4331846237182617,
190
+ "learning_rate": 2.9593329599246766e-05,
191
+ "loss": 0.115,
192
+ "num_input_tokens_seen": 577472,
193
+ "step": 230
194
+ },
195
+ {
196
+ "epoch": 0.2324455205811138,
197
+ "grad_norm": 2.525543212890625,
198
+ "learning_rate": 2.955737691236108e-05,
199
+ "loss": 0.1158,
200
+ "num_input_tokens_seen": 601856,
201
+ "step": 240
202
+ },
203
+ {
204
+ "epoch": 0.24213075060532688,
205
+ "grad_norm": 2.2355105876922607,
206
+ "learning_rate": 2.9519925309661422e-05,
207
+ "loss": 0.111,
208
+ "num_input_tokens_seen": 627904,
209
+ "step": 250
210
+ },
211
+ {
212
+ "epoch": 0.25181598062953997,
213
+ "grad_norm": 4.165389537811279,
214
+ "learning_rate": 2.948097864739194e-05,
215
+ "loss": 0.1314,
216
+ "num_input_tokens_seen": 651936,
217
+ "step": 260
218
+ },
219
+ {
220
+ "epoch": 0.26150121065375304,
221
+ "grad_norm": 3.1712851524353027,
222
+ "learning_rate": 2.944054093573719e-05,
223
+ "loss": 0.143,
224
+ "num_input_tokens_seen": 676416,
225
+ "step": 270
226
+ },
227
+ {
228
+ "epoch": 0.2711864406779661,
229
+ "grad_norm": 2.881716728210449,
230
+ "learning_rate": 2.93986163384092e-05,
231
+ "loss": 0.1121,
232
+ "num_input_tokens_seen": 700832,
233
+ "step": 280
234
+ },
235
+ {
236
+ "epoch": 0.28087167070217917,
237
+ "grad_norm": 3.060872793197632,
238
+ "learning_rate": 2.9355209172218777e-05,
239
+ "loss": 0.1159,
240
+ "num_input_tokens_seen": 725824,
241
+ "step": 290
242
+ },
243
+ {
244
+ "epoch": 0.29055690072639223,
245
+ "grad_norm": 4.449444770812988,
246
+ "learning_rate": 2.931032390663101e-05,
247
+ "loss": 0.133,
248
+ "num_input_tokens_seen": 749408,
249
+ "step": 300
250
+ },
251
+ {
252
+ "epoch": 0.30024213075060535,
253
+ "grad_norm": 5.323568344116211,
254
+ "learning_rate": 2.926396516330506e-05,
255
+ "loss": 0.1172,
256
+ "num_input_tokens_seen": 773984,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 0.3099273607748184,
261
+ "grad_norm": 3.144500732421875,
262
+ "learning_rate": 2.921613771561829e-05,
263
+ "loss": 0.136,
264
+ "num_input_tokens_seen": 799168,
265
+ "step": 320
266
+ },
267
+ {
268
+ "epoch": 0.3196125907990315,
269
+ "grad_norm": 2.433586359024048,
270
+ "learning_rate": 2.916684648817478e-05,
271
+ "loss": 0.0973,
272
+ "num_input_tokens_seen": 824320,
273
+ "step": 330
274
+ },
275
+ {
276
+ "epoch": 0.32929782082324455,
277
+ "grad_norm": 3.349472761154175,
278
+ "learning_rate": 2.9116096556298256e-05,
279
+ "loss": 0.13,
280
+ "num_input_tokens_seen": 849632,
281
+ "step": 340
282
+ },
283
+ {
284
+ "epoch": 0.3389830508474576,
285
+ "grad_norm": 1.8927061557769775,
286
+ "learning_rate": 2.9063893145509475e-05,
287
+ "loss": 0.1257,
288
+ "num_input_tokens_seen": 874400,
289
+ "step": 350
290
+ },
291
+ {
292
+ "epoch": 0.3486682808716707,
293
+ "grad_norm": 3.972686529159546,
294
+ "learning_rate": 2.901024163098822e-05,
295
+ "loss": 0.1155,
296
+ "num_input_tokens_seen": 899264,
297
+ "step": 360
298
+ },
299
+ {
300
+ "epoch": 0.3583535108958838,
301
+ "grad_norm": 1.177282452583313,
302
+ "learning_rate": 2.8955147537019815e-05,
303
+ "loss": 0.1251,
304
+ "num_input_tokens_seen": 924544,
305
+ "step": 370
306
+ },
307
+ {
308
+ "epoch": 0.36803874092009686,
309
+ "grad_norm": 1.9911576509475708,
310
+ "learning_rate": 2.88986165364263e-05,
311
+ "loss": 0.1147,
312
+ "num_input_tokens_seen": 949792,
313
+ "step": 380
314
+ },
315
+ {
316
+ "epoch": 0.37772397094430993,
317
+ "grad_norm": 2.402615785598755,
318
+ "learning_rate": 2.8840654449982344e-05,
319
+ "loss": 0.1433,
320
+ "num_input_tokens_seen": 974112,
321
+ "step": 390
322
+ },
323
+ {
324
+ "epoch": 0.387409200968523,
325
+ "grad_norm": 1.3184998035430908,
326
+ "learning_rate": 2.8781267245815898e-05,
327
+ "loss": 0.1117,
328
+ "num_input_tokens_seen": 999168,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 0.39709443099273606,
333
+ "grad_norm": 1.9284625053405762,
334
+ "learning_rate": 2.8720461038793672e-05,
335
+ "loss": 0.1353,
336
+ "num_input_tokens_seen": 1024320,
337
+ "step": 410
338
+ },
339
+ {
340
+ "epoch": 0.4067796610169492,
341
+ "grad_norm": 3.1020259857177734,
342
+ "learning_rate": 2.8658242089891515e-05,
343
+ "loss": 0.1165,
344
+ "num_input_tokens_seen": 1049088,
345
+ "step": 420
346
+ },
347
+ {
348
+ "epoch": 0.41646489104116224,
349
+ "grad_norm": 2.203179359436035,
350
+ "learning_rate": 2.8594616805549752e-05,
351
+ "loss": 0.1215,
352
+ "num_input_tokens_seen": 1073632,
353
+ "step": 430
354
+ },
355
+ {
356
+ "epoch": 0.4261501210653753,
357
+ "grad_norm": 2.053194522857666,
358
+ "learning_rate": 2.8529591737013526e-05,
359
+ "loss": 0.1066,
360
+ "num_input_tokens_seen": 1098208,
361
+ "step": 440
362
+ },
363
+ {
364
+ "epoch": 0.4358353510895884,
365
+ "grad_norm": 2.780935049057007,
366
+ "learning_rate": 2.8463173579658258e-05,
367
+ "loss": 0.0879,
368
+ "num_input_tokens_seen": 1122336,
369
+ "step": 450
370
+ },
371
+ {
372
+ "epoch": 0.44552058111380144,
373
+ "grad_norm": 1.9929611682891846,
374
+ "learning_rate": 2.8395369172300235e-05,
375
+ "loss": 0.1141,
376
+ "num_input_tokens_seen": 1147392,
377
+ "step": 460
378
+ },
379
+ {
380
+ "epoch": 0.4552058111380145,
381
+ "grad_norm": 1.1469779014587402,
382
+ "learning_rate": 2.8326185496492464e-05,
383
+ "loss": 0.1052,
384
+ "num_input_tokens_seen": 1173248,
385
+ "step": 470
386
+ },
387
+ {
388
+ "epoch": 0.4648910411622276,
389
+ "grad_norm": 2.501117706298828,
390
+ "learning_rate": 2.825562967580579e-05,
391
+ "loss": 0.1086,
392
+ "num_input_tokens_seen": 1197984,
393
+ "step": 480
394
+ },
395
+ {
396
+ "epoch": 0.4745762711864407,
397
+ "grad_norm": 2.0266308784484863,
398
+ "learning_rate": 2.8183708975095406e-05,
399
+ "loss": 0.1201,
400
+ "num_input_tokens_seen": 1222720,
401
+ "step": 490
402
+ },
403
+ {
404
+ "epoch": 0.48426150121065376,
405
+ "grad_norm": 1.1120251417160034,
406
+ "learning_rate": 2.8110430799752845e-05,
407
+ "loss": 0.1319,
408
+ "num_input_tokens_seen": 1247232,
409
+ "step": 500
410
+ },
411
+ {
412
+ "epoch": 0.4939467312348668,
413
+ "grad_norm": 1.2014496326446533,
414
+ "learning_rate": 2.8035802694943457e-05,
415
+ "loss": 0.1071,
416
+ "num_input_tokens_seen": 1273184,
417
+ "step": 510
418
+ },
419
+ {
420
+ "epoch": 0.5036319612590799,
421
+ "grad_norm": 1.1245245933532715,
422
+ "learning_rate": 2.7959832344829512e-05,
423
+ "loss": 0.1554,
424
+ "num_input_tokens_seen": 1298688,
425
+ "step": 520
426
+ },
427
+ {
428
+ "epoch": 0.513317191283293,
429
+ "grad_norm": 2.031115770339966,
430
+ "learning_rate": 2.7882527571779003e-05,
431
+ "loss": 0.1196,
432
+ "num_input_tokens_seen": 1324128,
433
+ "step": 530
434
+ },
435
+ {
436
+ "epoch": 0.5230024213075061,
437
+ "grad_norm": 1.7691289186477661,
438
+ "learning_rate": 2.78038963355602e-05,
439
+ "loss": 0.1334,
440
+ "num_input_tokens_seen": 1349120,
441
+ "step": 540
442
+ },
443
+ {
444
+ "epoch": 0.5326876513317191,
445
+ "grad_norm": 2.9496989250183105,
446
+ "learning_rate": 2.7723946732522055e-05,
447
+ "loss": 0.1109,
448
+ "num_input_tokens_seen": 1374304,
449
+ "step": 550
450
+ },
451
+ {
452
+ "epoch": 0.5423728813559322,
453
+ "grad_norm": 2.2881715297698975,
454
+ "learning_rate": 2.764268699476058e-05,
455
+ "loss": 0.1274,
456
+ "num_input_tokens_seen": 1399136,
457
+ "step": 560
458
+ },
459
+ {
460
+ "epoch": 0.5520581113801453,
461
+ "grad_norm": 1.9754095077514648,
462
+ "learning_rate": 2.756012548927119e-05,
463
+ "loss": 0.1397,
464
+ "num_input_tokens_seen": 1424672,
465
+ "step": 570
466
+ },
467
+ {
468
+ "epoch": 0.5617433414043583,
469
+ "grad_norm": 1.9883428812026978,
470
+ "learning_rate": 2.7476270717087215e-05,
471
+ "loss": 0.101,
472
+ "num_input_tokens_seen": 1449024,
473
+ "step": 580
474
+ },
475
+ {
476
+ "epoch": 0.5714285714285714,
477
+ "grad_norm": 0.9653130769729614,
478
+ "learning_rate": 2.7391131312404556e-05,
479
+ "loss": 0.0941,
480
+ "num_input_tokens_seen": 1475264,
481
+ "step": 590
482
+ },
483
+ {
484
+ "epoch": 0.5811138014527845,
485
+ "grad_norm": 4.576601028442383,
486
+ "learning_rate": 2.7304716041692663e-05,
487
+ "loss": 0.0865,
488
+ "num_input_tokens_seen": 1500064,
489
+ "step": 600
490
+ },
491
+ {
492
+ "epoch": 0.5907990314769975,
493
+ "grad_norm": 2.4046311378479004,
494
+ "learning_rate": 2.7217033802791906e-05,
495
+ "loss": 0.1596,
496
+ "num_input_tokens_seen": 1524448,
497
+ "step": 610
498
+ },
499
+ {
500
+ "epoch": 0.6004842615012107,
501
+ "grad_norm": 1.7785555124282837,
502
+ "learning_rate": 2.7128093623997368e-05,
503
+ "loss": 0.0891,
504
+ "num_input_tokens_seen": 1549536,
505
+ "step": 620
506
+ },
507
+ {
508
+ "epoch": 0.6101694915254238,
509
+ "grad_norm": 2.2736170291900635,
510
+ "learning_rate": 2.7037904663129262e-05,
511
+ "loss": 0.1085,
512
+ "num_input_tokens_seen": 1573408,
513
+ "step": 630
514
+ },
515
+ {
516
+ "epoch": 0.6198547215496368,
517
+ "grad_norm": 1.0862345695495605,
518
+ "learning_rate": 2.6946476206589972e-05,
519
+ "loss": 0.1023,
520
+ "num_input_tokens_seen": 1597888,
521
+ "step": 640
522
+ },
523
+ {
524
+ "epoch": 0.6295399515738499,
525
+ "grad_norm": 0.5358290672302246,
526
+ "learning_rate": 2.6853817668407875e-05,
527
+ "loss": 0.0669,
528
+ "num_input_tokens_seen": 1623296,
529
+ "step": 650
530
+ },
531
+ {
532
+ "epoch": 0.639225181598063,
533
+ "grad_norm": 2.3138749599456787,
534
+ "learning_rate": 2.6759938589268023e-05,
535
+ "loss": 0.1017,
536
+ "num_input_tokens_seen": 1649216,
537
+ "step": 660
538
+ },
539
+ {
540
+ "epoch": 0.648910411622276,
541
+ "grad_norm": 3.2054226398468018,
542
+ "learning_rate": 2.6664848635529742e-05,
543
+ "loss": 0.1432,
544
+ "num_input_tokens_seen": 1673760,
545
+ "step": 670
546
+ },
547
+ {
548
+ "epoch": 0.6585956416464891,
549
+ "grad_norm": 1.8352829217910767,
550
+ "learning_rate": 2.6568557598231385e-05,
551
+ "loss": 0.1081,
552
+ "num_input_tokens_seen": 1698592,
553
+ "step": 680
554
+ },
555
+ {
556
+ "epoch": 0.6682808716707022,
557
+ "grad_norm": 1.203284740447998,
558
+ "learning_rate": 2.6471075392082125e-05,
559
+ "loss": 0.1037,
560
+ "num_input_tokens_seen": 1723296,
561
+ "step": 690
562
+ },
563
+ {
564
+ "epoch": 0.6779661016949152,
565
+ "grad_norm": 1.635628581047058,
566
+ "learning_rate": 2.6372412054441116e-05,
567
+ "loss": 0.1216,
568
+ "num_input_tokens_seen": 1748384,
569
+ "step": 700
570
+ },
571
+ {
572
+ "epoch": 0.6876513317191283,
573
+ "grad_norm": 0.8993457555770874,
574
+ "learning_rate": 2.6272577744283965e-05,
575
+ "loss": 0.0853,
576
+ "num_input_tokens_seen": 1773600,
577
+ "step": 710
578
+ },
579
+ {
580
+ "epoch": 0.6973365617433414,
581
+ "grad_norm": 1.7306419610977173,
582
+ "learning_rate": 2.617158274115673e-05,
583
+ "loss": 0.1034,
584
+ "num_input_tokens_seen": 1798656,
585
+ "step": 720
586
+ },
587
+ {
588
+ "epoch": 0.7070217917675545,
589
+ "grad_norm": 2.770066976547241,
590
+ "learning_rate": 2.6069437444117432e-05,
591
+ "loss": 0.0872,
592
+ "num_input_tokens_seen": 1824544,
593
+ "step": 730
594
+ },
595
+ {
596
+ "epoch": 0.7167070217917676,
597
+ "grad_norm": 2.3590221405029297,
598
+ "learning_rate": 2.596615237066535e-05,
599
+ "loss": 0.1063,
600
+ "num_input_tokens_seen": 1848896,
601
+ "step": 740
602
+ },
603
+ {
604
+ "epoch": 0.7263922518159807,
605
+ "grad_norm": 1.0496519804000854,
606
+ "learning_rate": 2.586173815565805e-05,
607
+ "loss": 0.1104,
608
+ "num_input_tokens_seen": 1873248,
609
+ "step": 750
610
+ },
611
+ {
612
+ "epoch": 0.7360774818401937,
613
+ "grad_norm": 1.513573408126831,
614
+ "learning_rate": 2.575620555021634e-05,
615
+ "loss": 0.1125,
616
+ "num_input_tokens_seen": 1897184,
617
+ "step": 760
618
+ },
619
+ {
620
+ "epoch": 0.7457627118644068,
621
+ "grad_norm": 1.5545728206634521,
622
+ "learning_rate": 2.564956542061732e-05,
623
+ "loss": 0.0969,
624
+ "num_input_tokens_seen": 1922368,
625
+ "step": 770
626
+ },
627
+ {
628
+ "epoch": 0.7554479418886199,
629
+ "grad_norm": 1.9260263442993164,
630
+ "learning_rate": 2.5541828747175477e-05,
631
+ "loss": 0.1142,
632
+ "num_input_tokens_seen": 1947904,
633
+ "step": 780
634
+ },
635
+ {
636
+ "epoch": 0.7651331719128329,
637
+ "grad_norm": 2.396538734436035,
638
+ "learning_rate": 2.543300662311211e-05,
639
+ "loss": 0.0926,
640
+ "num_input_tokens_seen": 1971872,
641
+ "step": 790
642
+ },
643
+ {
644
+ "epoch": 0.774818401937046,
645
+ "grad_norm": 1.7069965600967407,
646
+ "learning_rate": 2.532311025341309e-05,
647
+ "loss": 0.0802,
648
+ "num_input_tokens_seen": 1996352,
649
+ "step": 800
650
+ },
651
+ {
652
+ "epoch": 0.784503631961259,
653
+ "grad_norm": 5.540910243988037,
654
+ "learning_rate": 2.5212150953675133e-05,
655
+ "loss": 0.1248,
656
+ "num_input_tokens_seen": 2020480,
657
+ "step": 810
658
+ },
659
+ {
660
+ "epoch": 0.7941888619854721,
661
+ "grad_norm": 1.7795952558517456,
662
+ "learning_rate": 2.5100140148940688e-05,
663
+ "loss": 0.0767,
664
+ "num_input_tokens_seen": 2044448,
665
+ "step": 820
666
+ },
667
+ {
668
+ "epoch": 0.8038740920096852,
669
+ "grad_norm": 2.7387983798980713,
670
+ "learning_rate": 2.498708937252153e-05,
671
+ "loss": 0.1239,
672
+ "num_input_tokens_seen": 2070400,
673
+ "step": 830
674
+ },
675
+ {
676
+ "epoch": 0.8135593220338984,
677
+ "grad_norm": 2.1243462562561035,
678
+ "learning_rate": 2.4873010264811222e-05,
679
+ "loss": 0.108,
680
+ "num_input_tokens_seen": 2095392,
681
+ "step": 840
682
+ },
683
+ {
684
+ "epoch": 0.8232445520581114,
685
+ "grad_norm": 0.9928631782531738,
686
+ "learning_rate": 2.4757914572086555e-05,
687
+ "loss": 0.0994,
688
+ "num_input_tokens_seen": 2120192,
689
+ "step": 850
690
+ },
691
+ {
692
+ "epoch": 0.8329297820823245,
693
+ "grad_norm": 6.047460556030273,
694
+ "learning_rate": 2.464181414529809e-05,
695
+ "loss": 0.0927,
696
+ "num_input_tokens_seen": 2144384,
697
+ "step": 860
698
+ },
699
+ {
700
+ "epoch": 0.8426150121065376,
701
+ "grad_norm": 2.2197115421295166,
702
+ "learning_rate": 2.4524720938849883e-05,
703
+ "loss": 0.1328,
704
+ "num_input_tokens_seen": 2168704,
705
+ "step": 870
706
+ },
707
+ {
708
+ "epoch": 0.8523002421307506,
709
+ "grad_norm": 2.0752601623535156,
710
+ "learning_rate": 2.440664700936861e-05,
711
+ "loss": 0.1229,
712
+ "num_input_tokens_seen": 2193248,
713
+ "step": 880
714
+ },
715
+ {
716
+ "epoch": 0.8619854721549637,
717
+ "grad_norm": 1.00425386428833,
718
+ "learning_rate": 2.4287604514462152e-05,
719
+ "loss": 0.0957,
720
+ "num_input_tokens_seen": 2217568,
721
+ "step": 890
722
+ },
723
+ {
724
+ "epoch": 0.8716707021791767,
725
+ "grad_norm": 1.9153094291687012,
726
+ "learning_rate": 2.416760571146774e-05,
727
+ "loss": 0.0975,
728
+ "num_input_tokens_seen": 2242048,
729
+ "step": 900
730
+ },
731
+ {
732
+ "epoch": 0.8813559322033898,
733
+ "grad_norm": 2.3558013439178467,
734
+ "learning_rate": 2.4046662956189898e-05,
735
+ "loss": 0.1068,
736
+ "num_input_tokens_seen": 2266112,
737
+ "step": 910
738
+ },
739
+ {
740
+ "epoch": 0.8910411622276029,
741
+ "grad_norm": 2.546351909637451,
742
+ "learning_rate": 2.3924788701628197e-05,
743
+ "loss": 0.0688,
744
+ "num_input_tokens_seen": 2290720,
745
+ "step": 920
746
+ },
747
+ {
748
+ "epoch": 0.9007263922518159,
749
+ "grad_norm": 1.2526168823242188,
750
+ "learning_rate": 2.3801995496695028e-05,
751
+ "loss": 0.1141,
752
+ "num_input_tokens_seen": 2315488,
753
+ "step": 930
754
+ },
755
+ {
756
+ "epoch": 0.910411622276029,
757
+ "grad_norm": 2.134089231491089,
758
+ "learning_rate": 2.367829598492348e-05,
759
+ "loss": 0.1328,
760
+ "num_input_tokens_seen": 2340992,
761
+ "step": 940
762
+ },
763
+ {
764
+ "epoch": 0.9200968523002422,
765
+ "grad_norm": 1.332915186882019,
766
+ "learning_rate": 2.3553702903165502e-05,
767
+ "loss": 0.1,
768
+ "num_input_tokens_seen": 2366880,
769
+ "step": 950
770
+ },
771
+ {
772
+ "epoch": 0.9297820823244553,
773
+ "grad_norm": 1.5140970945358276,
774
+ "learning_rate": 2.3428229080280407e-05,
775
+ "loss": 0.1089,
776
+ "num_input_tokens_seen": 2392000,
777
+ "step": 960
778
+ },
779
+ {
780
+ "epoch": 0.9394673123486683,
781
+ "grad_norm": 1.531954288482666,
782
+ "learning_rate": 2.330188743581398e-05,
783
+ "loss": 0.0924,
784
+ "num_input_tokens_seen": 2417472,
785
+ "step": 970
786
+ },
787
+ {
788
+ "epoch": 0.9491525423728814,
789
+ "grad_norm": 1.3347736597061157,
790
+ "learning_rate": 2.3174690978668155e-05,
791
+ "loss": 0.1205,
792
+ "num_input_tokens_seen": 2442496,
793
+ "step": 980
794
+ },
795
+ {
796
+ "epoch": 0.9588377723970944,
797
+ "grad_norm": 3.1497702598571777,
798
+ "learning_rate": 2.3046652805761588e-05,
799
+ "loss": 0.1004,
800
+ "num_input_tokens_seen": 2467392,
801
+ "step": 990
802
+ },
803
+ {
804
+ "epoch": 0.9685230024213075,
805
+ "grad_norm": 1.6756023168563843,
806
+ "learning_rate": 2.2917786100681078e-05,
807
+ "loss": 0.1007,
808
+ "num_input_tokens_seen": 2492768,
809
+ "step": 1000
810
+ }
811
+ ],
812
+ "logging_steps": 10,
813
+ "max_steps": 3096,
814
+ "num_input_tokens_seen": 2492768,
815
+ "num_train_epochs": 3,
816
+ "save_steps": 1000,
817
+ "stateful_callbacks": {
818
+ "TrainerControl": {
819
+ "args": {
820
+ "should_epoch_stop": false,
821
+ "should_evaluate": false,
822
+ "should_log": false,
823
+ "should_save": true,
824
+ "should_training_stop": false
825
+ },
826
+ "attributes": {}
827
+ }
828
+ },
829
+ "total_flos": 1.0671206790148915e+17,
830
+ "train_batch_size": 4,
831
+ "trial_name": null,
832
+ "trial_params": null
833
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de7a6d9a1a05e78971782a6ee6bb88dfb9617ab9f9e2f35984cd80b0711875f6
3
+ size 5688
checkpoint-2000/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
checkpoint-2000/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "up_proj",
24
+ "gate_proj",
25
+ "q_proj",
26
+ "down_proj",
27
+ "v_proj",
28
+ "k_proj",
29
+ "o_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
checkpoint-2000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d8c6480a2e32e03f92e2fccc389b6cf9b5355f580066f3c1b39cac5d652930d
3
+ size 83945296
checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df64ffde1d99ec422676939a623694193894f5fdea9b1069a5b9c7072a1f107c
3
+ size 168149074
checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ff264f99d31b522cc7e2a4eac9d38606d0c58a34c0adc74d71e0ca8b371dc36
3
+ size 14244
checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6729b4eb02aeb2fb6d654b8bca00ebe8fe8c29edfd4ed50961ee70773f756c3b
3
+ size 1064
checkpoint-2000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-2000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
3
+ size 587404
checkpoint-2000/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,1633 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.9365617433414044,
5
+ "eval_steps": 500,
6
+ "global_step": 2000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009685230024213076,
13
+ "grad_norm": 6.778852939605713,
14
+ "learning_rate": 2.9999227754514262e-05,
15
+ "loss": 0.8519,
16
+ "num_input_tokens_seen": 25568,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.01937046004842615,
21
+ "grad_norm": 3.0029561519622803,
22
+ "learning_rate": 2.9996911097572118e-05,
23
+ "loss": 0.189,
24
+ "num_input_tokens_seen": 51072,
25
+ "step": 20
26
+ },
27
+ {
28
+ "epoch": 0.029055690072639227,
29
+ "grad_norm": 5.477710247039795,
30
+ "learning_rate": 2.9993050267710624e-05,
31
+ "loss": 0.1648,
32
+ "num_input_tokens_seen": 76416,
33
+ "step": 30
34
+ },
35
+ {
36
+ "epoch": 0.0387409200968523,
37
+ "grad_norm": 4.35634183883667,
38
+ "learning_rate": 2.9987645662464235e-05,
39
+ "loss": 0.1905,
40
+ "num_input_tokens_seen": 101344,
41
+ "step": 40
42
+ },
43
+ {
44
+ "epoch": 0.048426150121065374,
45
+ "grad_norm": 4.523565292358398,
46
+ "learning_rate": 2.9980697838323884e-05,
47
+ "loss": 0.1794,
48
+ "num_input_tokens_seen": 126656,
49
+ "step": 50
50
+ },
51
+ {
52
+ "epoch": 0.05811138014527845,
53
+ "grad_norm": 1.9348187446594238,
54
+ "learning_rate": 2.9972207510679677e-05,
55
+ "loss": 0.1528,
56
+ "num_input_tokens_seen": 151200,
57
+ "step": 60
58
+ },
59
+ {
60
+ "epoch": 0.06779661016949153,
61
+ "grad_norm": 2.981433629989624,
62
+ "learning_rate": 2.996217555374725e-05,
63
+ "loss": 0.1742,
64
+ "num_input_tokens_seen": 175968,
65
+ "step": 70
66
+ },
67
+ {
68
+ "epoch": 0.0774818401937046,
69
+ "grad_norm": 3.6294591426849365,
70
+ "learning_rate": 2.9950603000477722e-05,
71
+ "loss": 0.1565,
72
+ "num_input_tokens_seen": 201280,
73
+ "step": 80
74
+ },
75
+ {
76
+ "epoch": 0.08716707021791767,
77
+ "grad_norm": 2.5459301471710205,
78
+ "learning_rate": 2.993749104245137e-05,
79
+ "loss": 0.1499,
80
+ "num_input_tokens_seen": 226432,
81
+ "step": 90
82
+ },
83
+ {
84
+ "epoch": 0.09685230024213075,
85
+ "grad_norm": 2.2721059322357178,
86
+ "learning_rate": 2.992284102975491e-05,
87
+ "loss": 0.1441,
88
+ "num_input_tokens_seen": 251744,
89
+ "step": 100
90
+ },
91
+ {
92
+ "epoch": 0.10653753026634383,
93
+ "grad_norm": 2.0033624172210693,
94
+ "learning_rate": 2.9906654470842492e-05,
95
+ "loss": 0.1245,
96
+ "num_input_tokens_seen": 276480,
97
+ "step": 110
98
+ },
99
+ {
100
+ "epoch": 0.1162227602905569,
101
+ "grad_norm": 8.585118293762207,
102
+ "learning_rate": 2.9888933032380397e-05,
103
+ "loss": 0.1333,
104
+ "num_input_tokens_seen": 301664,
105
+ "step": 120
106
+ },
107
+ {
108
+ "epoch": 0.12590799031476999,
109
+ "grad_norm": 1.423967719078064,
110
+ "learning_rate": 2.9869678539075403e-05,
111
+ "loss": 0.1728,
112
+ "num_input_tokens_seen": 326784,
113
+ "step": 130
114
+ },
115
+ {
116
+ "epoch": 0.13559322033898305,
117
+ "grad_norm": 2.6306211948394775,
118
+ "learning_rate": 2.9848892973486912e-05,
119
+ "loss": 0.1281,
120
+ "num_input_tokens_seen": 351328,
121
+ "step": 140
122
+ },
123
+ {
124
+ "epoch": 0.14527845036319612,
125
+ "grad_norm": 2.5618090629577637,
126
+ "learning_rate": 2.9826578475822825e-05,
127
+ "loss": 0.1136,
128
+ "num_input_tokens_seen": 376000,
129
+ "step": 150
130
+ },
131
+ {
132
+ "epoch": 0.1549636803874092,
133
+ "grad_norm": 2.694077730178833,
134
+ "learning_rate": 2.980273734371914e-05,
135
+ "loss": 0.1277,
136
+ "num_input_tokens_seen": 400384,
137
+ "step": 160
138
+ },
139
+ {
140
+ "epoch": 0.16464891041162227,
141
+ "grad_norm": 2.632338047027588,
142
+ "learning_rate": 2.9777372032003423e-05,
143
+ "loss": 0.1028,
144
+ "num_input_tokens_seen": 426432,
145
+ "step": 170
146
+ },
147
+ {
148
+ "epoch": 0.17433414043583534,
149
+ "grad_norm": 2.3446829319000244,
150
+ "learning_rate": 2.975048515244199e-05,
151
+ "loss": 0.1245,
152
+ "num_input_tokens_seen": 451712,
153
+ "step": 180
154
+ },
155
+ {
156
+ "epoch": 0.18401937046004843,
157
+ "grad_norm": 1.8457319736480713,
158
+ "learning_rate": 2.9722079473471035e-05,
159
+ "loss": 0.142,
160
+ "num_input_tokens_seen": 476960,
161
+ "step": 190
162
+ },
163
+ {
164
+ "epoch": 0.1937046004842615,
165
+ "grad_norm": 1.8676010370254517,
166
+ "learning_rate": 2.9692157919911536e-05,
167
+ "loss": 0.1342,
168
+ "num_input_tokens_seen": 501440,
169
+ "step": 200
170
+ },
171
+ {
172
+ "epoch": 0.2033898305084746,
173
+ "grad_norm": 4.593673229217529,
174
+ "learning_rate": 2.966072357266811e-05,
175
+ "loss": 0.1314,
176
+ "num_input_tokens_seen": 526656,
177
+ "step": 210
178
+ },
179
+ {
180
+ "epoch": 0.21307506053268765,
181
+ "grad_norm": 3.9568676948547363,
182
+ "learning_rate": 2.9627779668411795e-05,
183
+ "loss": 0.171,
184
+ "num_input_tokens_seen": 552544,
185
+ "step": 220
186
+ },
187
+ {
188
+ "epoch": 0.22276029055690072,
189
+ "grad_norm": 2.4331846237182617,
190
+ "learning_rate": 2.9593329599246766e-05,
191
+ "loss": 0.115,
192
+ "num_input_tokens_seen": 577472,
193
+ "step": 230
194
+ },
195
+ {
196
+ "epoch": 0.2324455205811138,
197
+ "grad_norm": 2.525543212890625,
198
+ "learning_rate": 2.955737691236108e-05,
199
+ "loss": 0.1158,
200
+ "num_input_tokens_seen": 601856,
201
+ "step": 240
202
+ },
203
+ {
204
+ "epoch": 0.24213075060532688,
205
+ "grad_norm": 2.2355105876922607,
206
+ "learning_rate": 2.9519925309661422e-05,
207
+ "loss": 0.111,
208
+ "num_input_tokens_seen": 627904,
209
+ "step": 250
210
+ },
211
+ {
212
+ "epoch": 0.25181598062953997,
213
+ "grad_norm": 4.165389537811279,
214
+ "learning_rate": 2.948097864739194e-05,
215
+ "loss": 0.1314,
216
+ "num_input_tokens_seen": 651936,
217
+ "step": 260
218
+ },
219
+ {
220
+ "epoch": 0.26150121065375304,
221
+ "grad_norm": 3.1712851524353027,
222
+ "learning_rate": 2.944054093573719e-05,
223
+ "loss": 0.143,
224
+ "num_input_tokens_seen": 676416,
225
+ "step": 270
226
+ },
227
+ {
228
+ "epoch": 0.2711864406779661,
229
+ "grad_norm": 2.881716728210449,
230
+ "learning_rate": 2.93986163384092e-05,
231
+ "loss": 0.1121,
232
+ "num_input_tokens_seen": 700832,
233
+ "step": 280
234
+ },
235
+ {
236
+ "epoch": 0.28087167070217917,
237
+ "grad_norm": 3.060872793197632,
238
+ "learning_rate": 2.9355209172218777e-05,
239
+ "loss": 0.1159,
240
+ "num_input_tokens_seen": 725824,
241
+ "step": 290
242
+ },
243
+ {
244
+ "epoch": 0.29055690072639223,
245
+ "grad_norm": 4.449444770812988,
246
+ "learning_rate": 2.931032390663101e-05,
247
+ "loss": 0.133,
248
+ "num_input_tokens_seen": 749408,
249
+ "step": 300
250
+ },
251
+ {
252
+ "epoch": 0.30024213075060535,
253
+ "grad_norm": 5.323568344116211,
254
+ "learning_rate": 2.926396516330506e-05,
255
+ "loss": 0.1172,
256
+ "num_input_tokens_seen": 773984,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 0.3099273607748184,
261
+ "grad_norm": 3.144500732421875,
262
+ "learning_rate": 2.921613771561829e-05,
263
+ "loss": 0.136,
264
+ "num_input_tokens_seen": 799168,
265
+ "step": 320
266
+ },
267
+ {
268
+ "epoch": 0.3196125907990315,
269
+ "grad_norm": 2.433586359024048,
270
+ "learning_rate": 2.916684648817478e-05,
271
+ "loss": 0.0973,
272
+ "num_input_tokens_seen": 824320,
273
+ "step": 330
274
+ },
275
+ {
276
+ "epoch": 0.32929782082324455,
277
+ "grad_norm": 3.349472761154175,
278
+ "learning_rate": 2.9116096556298256e-05,
279
+ "loss": 0.13,
280
+ "num_input_tokens_seen": 849632,
281
+ "step": 340
282
+ },
283
+ {
284
+ "epoch": 0.3389830508474576,
285
+ "grad_norm": 1.8927061557769775,
286
+ "learning_rate": 2.9063893145509475e-05,
287
+ "loss": 0.1257,
288
+ "num_input_tokens_seen": 874400,
289
+ "step": 350
290
+ },
291
+ {
292
+ "epoch": 0.3486682808716707,
293
+ "grad_norm": 3.972686529159546,
294
+ "learning_rate": 2.901024163098822e-05,
295
+ "loss": 0.1155,
296
+ "num_input_tokens_seen": 899264,
297
+ "step": 360
298
+ },
299
+ {
300
+ "epoch": 0.3583535108958838,
301
+ "grad_norm": 1.177282452583313,
302
+ "learning_rate": 2.8955147537019815e-05,
303
+ "loss": 0.1251,
304
+ "num_input_tokens_seen": 924544,
305
+ "step": 370
306
+ },
307
+ {
308
+ "epoch": 0.36803874092009686,
309
+ "grad_norm": 1.9911576509475708,
310
+ "learning_rate": 2.88986165364263e-05,
311
+ "loss": 0.1147,
312
+ "num_input_tokens_seen": 949792,
313
+ "step": 380
314
+ },
315
+ {
316
+ "epoch": 0.37772397094430993,
317
+ "grad_norm": 2.402615785598755,
318
+ "learning_rate": 2.8840654449982344e-05,
319
+ "loss": 0.1433,
320
+ "num_input_tokens_seen": 974112,
321
+ "step": 390
322
+ },
323
+ {
324
+ "epoch": 0.387409200968523,
325
+ "grad_norm": 1.3184998035430908,
326
+ "learning_rate": 2.8781267245815898e-05,
327
+ "loss": 0.1117,
328
+ "num_input_tokens_seen": 999168,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 0.39709443099273606,
333
+ "grad_norm": 1.9284625053405762,
334
+ "learning_rate": 2.8720461038793672e-05,
335
+ "loss": 0.1353,
336
+ "num_input_tokens_seen": 1024320,
337
+ "step": 410
338
+ },
339
+ {
340
+ "epoch": 0.4067796610169492,
341
+ "grad_norm": 3.1020259857177734,
342
+ "learning_rate": 2.8658242089891515e-05,
343
+ "loss": 0.1165,
344
+ "num_input_tokens_seen": 1049088,
345
+ "step": 420
346
+ },
347
+ {
348
+ "epoch": 0.41646489104116224,
349
+ "grad_norm": 2.203179359436035,
350
+ "learning_rate": 2.8594616805549752e-05,
351
+ "loss": 0.1215,
352
+ "num_input_tokens_seen": 1073632,
353
+ "step": 430
354
+ },
355
+ {
356
+ "epoch": 0.4261501210653753,
357
+ "grad_norm": 2.053194522857666,
358
+ "learning_rate": 2.8529591737013526e-05,
359
+ "loss": 0.1066,
360
+ "num_input_tokens_seen": 1098208,
361
+ "step": 440
362
+ },
363
+ {
364
+ "epoch": 0.4358353510895884,
365
+ "grad_norm": 2.780935049057007,
366
+ "learning_rate": 2.8463173579658258e-05,
367
+ "loss": 0.0879,
368
+ "num_input_tokens_seen": 1122336,
369
+ "step": 450
370
+ },
371
+ {
372
+ "epoch": 0.44552058111380144,
373
+ "grad_norm": 1.9929611682891846,
374
+ "learning_rate": 2.8395369172300235e-05,
375
+ "loss": 0.1141,
376
+ "num_input_tokens_seen": 1147392,
377
+ "step": 460
378
+ },
379
+ {
380
+ "epoch": 0.4552058111380145,
381
+ "grad_norm": 1.1469779014587402,
382
+ "learning_rate": 2.8326185496492464e-05,
383
+ "loss": 0.1052,
384
+ "num_input_tokens_seen": 1173248,
385
+ "step": 470
386
+ },
387
+ {
388
+ "epoch": 0.4648910411622276,
389
+ "grad_norm": 2.501117706298828,
390
+ "learning_rate": 2.825562967580579e-05,
391
+ "loss": 0.1086,
392
+ "num_input_tokens_seen": 1197984,
393
+ "step": 480
394
+ },
395
+ {
396
+ "epoch": 0.4745762711864407,
397
+ "grad_norm": 2.0266308784484863,
398
+ "learning_rate": 2.8183708975095406e-05,
399
+ "loss": 0.1201,
400
+ "num_input_tokens_seen": 1222720,
401
+ "step": 490
402
+ },
403
+ {
404
+ "epoch": 0.48426150121065376,
405
+ "grad_norm": 1.1120251417160034,
406
+ "learning_rate": 2.8110430799752845e-05,
407
+ "loss": 0.1319,
408
+ "num_input_tokens_seen": 1247232,
409
+ "step": 500
410
+ },
411
+ {
412
+ "epoch": 0.4939467312348668,
413
+ "grad_norm": 1.2014496326446533,
414
+ "learning_rate": 2.8035802694943457e-05,
415
+ "loss": 0.1071,
416
+ "num_input_tokens_seen": 1273184,
417
+ "step": 510
418
+ },
419
+ {
420
+ "epoch": 0.5036319612590799,
421
+ "grad_norm": 1.1245245933532715,
422
+ "learning_rate": 2.7959832344829512e-05,
423
+ "loss": 0.1554,
424
+ "num_input_tokens_seen": 1298688,
425
+ "step": 520
426
+ },
427
+ {
428
+ "epoch": 0.513317191283293,
429
+ "grad_norm": 2.031115770339966,
430
+ "learning_rate": 2.7882527571779003e-05,
431
+ "loss": 0.1196,
432
+ "num_input_tokens_seen": 1324128,
433
+ "step": 530
434
+ },
435
+ {
436
+ "epoch": 0.5230024213075061,
437
+ "grad_norm": 1.7691289186477661,
438
+ "learning_rate": 2.78038963355602e-05,
439
+ "loss": 0.1334,
440
+ "num_input_tokens_seen": 1349120,
441
+ "step": 540
442
+ },
443
+ {
444
+ "epoch": 0.5326876513317191,
445
+ "grad_norm": 2.9496989250183105,
446
+ "learning_rate": 2.7723946732522055e-05,
447
+ "loss": 0.1109,
448
+ "num_input_tokens_seen": 1374304,
449
+ "step": 550
450
+ },
451
+ {
452
+ "epoch": 0.5423728813559322,
453
+ "grad_norm": 2.2881715297698975,
454
+ "learning_rate": 2.764268699476058e-05,
455
+ "loss": 0.1274,
456
+ "num_input_tokens_seen": 1399136,
457
+ "step": 560
458
+ },
459
+ {
460
+ "epoch": 0.5520581113801453,
461
+ "grad_norm": 1.9754095077514648,
462
+ "learning_rate": 2.756012548927119e-05,
463
+ "loss": 0.1397,
464
+ "num_input_tokens_seen": 1424672,
465
+ "step": 570
466
+ },
467
+ {
468
+ "epoch": 0.5617433414043583,
469
+ "grad_norm": 1.9883428812026978,
470
+ "learning_rate": 2.7476270717087215e-05,
471
+ "loss": 0.101,
472
+ "num_input_tokens_seen": 1449024,
473
+ "step": 580
474
+ },
475
+ {
476
+ "epoch": 0.5714285714285714,
477
+ "grad_norm": 0.9653130769729614,
478
+ "learning_rate": 2.7391131312404556e-05,
479
+ "loss": 0.0941,
480
+ "num_input_tokens_seen": 1475264,
481
+ "step": 590
482
+ },
483
+ {
484
+ "epoch": 0.5811138014527845,
485
+ "grad_norm": 4.576601028442383,
486
+ "learning_rate": 2.7304716041692663e-05,
487
+ "loss": 0.0865,
488
+ "num_input_tokens_seen": 1500064,
489
+ "step": 600
490
+ },
491
+ {
492
+ "epoch": 0.5907990314769975,
493
+ "grad_norm": 2.4046311378479004,
494
+ "learning_rate": 2.7217033802791906e-05,
495
+ "loss": 0.1596,
496
+ "num_input_tokens_seen": 1524448,
497
+ "step": 610
498
+ },
499
+ {
500
+ "epoch": 0.6004842615012107,
501
+ "grad_norm": 1.7785555124282837,
502
+ "learning_rate": 2.7128093623997368e-05,
503
+ "loss": 0.0891,
504
+ "num_input_tokens_seen": 1549536,
505
+ "step": 620
506
+ },
507
+ {
508
+ "epoch": 0.6101694915254238,
509
+ "grad_norm": 2.2736170291900635,
510
+ "learning_rate": 2.7037904663129262e-05,
511
+ "loss": 0.1085,
512
+ "num_input_tokens_seen": 1573408,
513
+ "step": 630
514
+ },
515
+ {
516
+ "epoch": 0.6198547215496368,
517
+ "grad_norm": 1.0862345695495605,
518
+ "learning_rate": 2.6946476206589972e-05,
519
+ "loss": 0.1023,
520
+ "num_input_tokens_seen": 1597888,
521
+ "step": 640
522
+ },
523
+ {
524
+ "epoch": 0.6295399515738499,
525
+ "grad_norm": 0.5358290672302246,
526
+ "learning_rate": 2.6853817668407875e-05,
527
+ "loss": 0.0669,
528
+ "num_input_tokens_seen": 1623296,
529
+ "step": 650
530
+ },
531
+ {
532
+ "epoch": 0.639225181598063,
533
+ "grad_norm": 2.3138749599456787,
534
+ "learning_rate": 2.6759938589268023e-05,
535
+ "loss": 0.1017,
536
+ "num_input_tokens_seen": 1649216,
537
+ "step": 660
538
+ },
539
+ {
540
+ "epoch": 0.648910411622276,
541
+ "grad_norm": 3.2054226398468018,
542
+ "learning_rate": 2.6664848635529742e-05,
543
+ "loss": 0.1432,
544
+ "num_input_tokens_seen": 1673760,
545
+ "step": 670
546
+ },
547
+ {
548
+ "epoch": 0.6585956416464891,
549
+ "grad_norm": 1.8352829217910767,
550
+ "learning_rate": 2.6568557598231385e-05,
551
+ "loss": 0.1081,
552
+ "num_input_tokens_seen": 1698592,
553
+ "step": 680
554
+ },
555
+ {
556
+ "epoch": 0.6682808716707022,
557
+ "grad_norm": 1.203284740447998,
558
+ "learning_rate": 2.6471075392082125e-05,
559
+ "loss": 0.1037,
560
+ "num_input_tokens_seen": 1723296,
561
+ "step": 690
562
+ },
563
+ {
564
+ "epoch": 0.6779661016949152,
565
+ "grad_norm": 1.635628581047058,
566
+ "learning_rate": 2.6372412054441116e-05,
567
+ "loss": 0.1216,
568
+ "num_input_tokens_seen": 1748384,
569
+ "step": 700
570
+ },
571
+ {
572
+ "epoch": 0.6876513317191283,
573
+ "grad_norm": 0.8993457555770874,
574
+ "learning_rate": 2.6272577744283965e-05,
575
+ "loss": 0.0853,
576
+ "num_input_tokens_seen": 1773600,
577
+ "step": 710
578
+ },
579
+ {
580
+ "epoch": 0.6973365617433414,
581
+ "grad_norm": 1.7306419610977173,
582
+ "learning_rate": 2.617158274115673e-05,
583
+ "loss": 0.1034,
584
+ "num_input_tokens_seen": 1798656,
585
+ "step": 720
586
+ },
587
+ {
588
+ "epoch": 0.7070217917675545,
589
+ "grad_norm": 2.770066976547241,
590
+ "learning_rate": 2.6069437444117432e-05,
591
+ "loss": 0.0872,
592
+ "num_input_tokens_seen": 1824544,
593
+ "step": 730
594
+ },
595
+ {
596
+ "epoch": 0.7167070217917676,
597
+ "grad_norm": 2.3590221405029297,
598
+ "learning_rate": 2.596615237066535e-05,
599
+ "loss": 0.1063,
600
+ "num_input_tokens_seen": 1848896,
601
+ "step": 740
602
+ },
603
+ {
604
+ "epoch": 0.7263922518159807,
605
+ "grad_norm": 1.0496519804000854,
606
+ "learning_rate": 2.586173815565805e-05,
607
+ "loss": 0.1104,
608
+ "num_input_tokens_seen": 1873248,
609
+ "step": 750
610
+ },
611
+ {
612
+ "epoch": 0.7360774818401937,
613
+ "grad_norm": 1.513573408126831,
614
+ "learning_rate": 2.575620555021634e-05,
615
+ "loss": 0.1125,
616
+ "num_input_tokens_seen": 1897184,
617
+ "step": 760
618
+ },
619
+ {
620
+ "epoch": 0.7457627118644068,
621
+ "grad_norm": 1.5545728206634521,
622
+ "learning_rate": 2.564956542061732e-05,
623
+ "loss": 0.0969,
624
+ "num_input_tokens_seen": 1922368,
625
+ "step": 770
626
+ },
627
+ {
628
+ "epoch": 0.7554479418886199,
629
+ "grad_norm": 1.9260263442993164,
630
+ "learning_rate": 2.5541828747175477e-05,
631
+ "loss": 0.1142,
632
+ "num_input_tokens_seen": 1947904,
633
+ "step": 780
634
+ },
635
+ {
636
+ "epoch": 0.7651331719128329,
637
+ "grad_norm": 2.396538734436035,
638
+ "learning_rate": 2.543300662311211e-05,
639
+ "loss": 0.0926,
640
+ "num_input_tokens_seen": 1971872,
641
+ "step": 790
642
+ },
643
+ {
644
+ "epoch": 0.774818401937046,
645
+ "grad_norm": 1.7069965600967407,
646
+ "learning_rate": 2.532311025341309e-05,
647
+ "loss": 0.0802,
648
+ "num_input_tokens_seen": 1996352,
649
+ "step": 800
650
+ },
651
+ {
652
+ "epoch": 0.784503631961259,
653
+ "grad_norm": 5.540910243988037,
654
+ "learning_rate": 2.5212150953675133e-05,
655
+ "loss": 0.1248,
656
+ "num_input_tokens_seen": 2020480,
657
+ "step": 810
658
+ },
659
+ {
660
+ "epoch": 0.7941888619854721,
661
+ "grad_norm": 1.7795952558517456,
662
+ "learning_rate": 2.5100140148940688e-05,
663
+ "loss": 0.0767,
664
+ "num_input_tokens_seen": 2044448,
665
+ "step": 820
666
+ },
667
+ {
668
+ "epoch": 0.8038740920096852,
669
+ "grad_norm": 2.7387983798980713,
670
+ "learning_rate": 2.498708937252153e-05,
671
+ "loss": 0.1239,
672
+ "num_input_tokens_seen": 2070400,
673
+ "step": 830
674
+ },
675
+ {
676
+ "epoch": 0.8135593220338984,
677
+ "grad_norm": 2.1243462562561035,
678
+ "learning_rate": 2.4873010264811222e-05,
679
+ "loss": 0.108,
680
+ "num_input_tokens_seen": 2095392,
681
+ "step": 840
682
+ },
683
+ {
684
+ "epoch": 0.8232445520581114,
685
+ "grad_norm": 0.9928631782531738,
686
+ "learning_rate": 2.4757914572086555e-05,
687
+ "loss": 0.0994,
688
+ "num_input_tokens_seen": 2120192,
689
+ "step": 850
690
+ },
691
+ {
692
+ "epoch": 0.8329297820823245,
693
+ "grad_norm": 6.047460556030273,
694
+ "learning_rate": 2.464181414529809e-05,
695
+ "loss": 0.0927,
696
+ "num_input_tokens_seen": 2144384,
697
+ "step": 860
698
+ },
699
+ {
700
+ "epoch": 0.8426150121065376,
701
+ "grad_norm": 2.2197115421295166,
702
+ "learning_rate": 2.4524720938849883e-05,
703
+ "loss": 0.1328,
704
+ "num_input_tokens_seen": 2168704,
705
+ "step": 870
706
+ },
707
+ {
708
+ "epoch": 0.8523002421307506,
709
+ "grad_norm": 2.0752601623535156,
710
+ "learning_rate": 2.440664700936861e-05,
711
+ "loss": 0.1229,
712
+ "num_input_tokens_seen": 2193248,
713
+ "step": 880
714
+ },
715
+ {
716
+ "epoch": 0.8619854721549637,
717
+ "grad_norm": 1.00425386428833,
718
+ "learning_rate": 2.4287604514462152e-05,
719
+ "loss": 0.0957,
720
+ "num_input_tokens_seen": 2217568,
721
+ "step": 890
722
+ },
723
+ {
724
+ "epoch": 0.8716707021791767,
725
+ "grad_norm": 1.9153094291687012,
726
+ "learning_rate": 2.416760571146774e-05,
727
+ "loss": 0.0975,
728
+ "num_input_tokens_seen": 2242048,
729
+ "step": 900
730
+ },
731
+ {
732
+ "epoch": 0.8813559322033898,
733
+ "grad_norm": 2.3558013439178467,
734
+ "learning_rate": 2.4046662956189898e-05,
735
+ "loss": 0.1068,
736
+ "num_input_tokens_seen": 2266112,
737
+ "step": 910
738
+ },
739
+ {
740
+ "epoch": 0.8910411622276029,
741
+ "grad_norm": 2.546351909637451,
742
+ "learning_rate": 2.3924788701628197e-05,
743
+ "loss": 0.0688,
744
+ "num_input_tokens_seen": 2290720,
745
+ "step": 920
746
+ },
747
+ {
748
+ "epoch": 0.9007263922518159,
749
+ "grad_norm": 1.2526168823242188,
750
+ "learning_rate": 2.3801995496695028e-05,
751
+ "loss": 0.1141,
752
+ "num_input_tokens_seen": 2315488,
753
+ "step": 930
754
+ },
755
+ {
756
+ "epoch": 0.910411622276029,
757
+ "grad_norm": 2.134089231491089,
758
+ "learning_rate": 2.367829598492348e-05,
759
+ "loss": 0.1328,
760
+ "num_input_tokens_seen": 2340992,
761
+ "step": 940
762
+ },
763
+ {
764
+ "epoch": 0.9200968523002422,
765
+ "grad_norm": 1.332915186882019,
766
+ "learning_rate": 2.3553702903165502e-05,
767
+ "loss": 0.1,
768
+ "num_input_tokens_seen": 2366880,
769
+ "step": 950
770
+ },
771
+ {
772
+ "epoch": 0.9297820823244553,
773
+ "grad_norm": 1.5140970945358276,
774
+ "learning_rate": 2.3428229080280407e-05,
775
+ "loss": 0.1089,
776
+ "num_input_tokens_seen": 2392000,
777
+ "step": 960
778
+ },
779
+ {
780
+ "epoch": 0.9394673123486683,
781
+ "grad_norm": 1.531954288482666,
782
+ "learning_rate": 2.330188743581398e-05,
783
+ "loss": 0.0924,
784
+ "num_input_tokens_seen": 2417472,
785
+ "step": 970
786
+ },
787
+ {
788
+ "epoch": 0.9491525423728814,
789
+ "grad_norm": 1.3347736597061157,
790
+ "learning_rate": 2.3174690978668155e-05,
791
+ "loss": 0.1205,
792
+ "num_input_tokens_seen": 2442496,
793
+ "step": 980
794
+ },
795
+ {
796
+ "epoch": 0.9588377723970944,
797
+ "grad_norm": 3.1497702598571777,
798
+ "learning_rate": 2.3046652805761588e-05,
799
+ "loss": 0.1004,
800
+ "num_input_tokens_seen": 2467392,
801
+ "step": 990
802
+ },
803
+ {
804
+ "epoch": 0.9685230024213075,
805
+ "grad_norm": 1.6756023168563843,
806
+ "learning_rate": 2.2917786100681078e-05,
807
+ "loss": 0.1007,
808
+ "num_input_tokens_seen": 2492768,
809
+ "step": 1000
810
+ },
811
+ {
812
+ "epoch": 0.9782082324455206,
813
+ "grad_norm": 2.56594181060791,
814
+ "learning_rate": 2.2788104132324125e-05,
815
+ "loss": 0.1179,
816
+ "num_input_tokens_seen": 2518176,
817
+ "step": 1010
818
+ },
819
+ {
820
+ "epoch": 0.9878934624697336,
821
+ "grad_norm": 2.1090595722198486,
822
+ "learning_rate": 2.2657620253532685e-05,
823
+ "loss": 0.0971,
824
+ "num_input_tokens_seen": 2543296,
825
+ "step": 1020
826
+ },
827
+ {
828
+ "epoch": 0.9975786924939467,
829
+ "grad_norm": 0.41959595680236816,
830
+ "learning_rate": 2.252634789971827e-05,
831
+ "loss": 0.0932,
832
+ "num_input_tokens_seen": 2567680,
833
+ "step": 1030
834
+ },
835
+ {
836
+ "epoch": 1.006779661016949,
837
+ "grad_norm": 1.6389803886413574,
838
+ "learning_rate": 2.2394300587478566e-05,
839
+ "loss": 0.0924,
840
+ "num_input_tokens_seen": 2591016,
841
+ "step": 1040
842
+ },
843
+ {
844
+ "epoch": 1.0164648910411622,
845
+ "grad_norm": 1.4045557975769043,
846
+ "learning_rate": 2.2261491913205684e-05,
847
+ "loss": 0.0985,
848
+ "num_input_tokens_seen": 2615752,
849
+ "step": 1050
850
+ },
851
+ {
852
+ "epoch": 1.0261501210653754,
853
+ "grad_norm": 2.0734925270080566,
854
+ "learning_rate": 2.212793555168617e-05,
855
+ "loss": 0.0853,
856
+ "num_input_tokens_seen": 2640200,
857
+ "step": 1060
858
+ },
859
+ {
860
+ "epoch": 1.0358353510895884,
861
+ "grad_norm": 2.1590147018432617,
862
+ "learning_rate": 2.1993645254692994e-05,
863
+ "loss": 0.116,
864
+ "num_input_tokens_seen": 2665416,
865
+ "step": 1070
866
+ },
867
+ {
868
+ "epoch": 1.0455205811138015,
869
+ "grad_norm": 1.739646553993225,
870
+ "learning_rate": 2.1858634849569578e-05,
871
+ "loss": 0.0972,
872
+ "num_input_tokens_seen": 2690376,
873
+ "step": 1080
874
+ },
875
+ {
876
+ "epoch": 1.0552058111380145,
877
+ "grad_norm": 0.6458954215049744,
878
+ "learning_rate": 2.1722918237806042e-05,
879
+ "loss": 0.0884,
880
+ "num_input_tokens_seen": 2715080,
881
+ "step": 1090
882
+ },
883
+ {
884
+ "epoch": 1.0648910411622277,
885
+ "grad_norm": 2.2830138206481934,
886
+ "learning_rate": 2.158650939360782e-05,
887
+ "loss": 0.073,
888
+ "num_input_tokens_seen": 2740424,
889
+ "step": 1100
890
+ },
891
+ {
892
+ "epoch": 1.0745762711864406,
893
+ "grad_norm": 1.5225194692611694,
894
+ "learning_rate": 2.1449422362456794e-05,
895
+ "loss": 0.0813,
896
+ "num_input_tokens_seen": 2765640,
897
+ "step": 1110
898
+ },
899
+ {
900
+ "epoch": 1.0842615012106538,
901
+ "grad_norm": 1.683604121208191,
902
+ "learning_rate": 2.13116712596651e-05,
903
+ "loss": 0.0953,
904
+ "num_input_tokens_seen": 2791176,
905
+ "step": 1120
906
+ },
907
+ {
908
+ "epoch": 1.0939467312348667,
909
+ "grad_norm": 1.5679166316986084,
910
+ "learning_rate": 2.1173270268921703e-05,
911
+ "loss": 0.0933,
912
+ "num_input_tokens_seen": 2816072,
913
+ "step": 1130
914
+ },
915
+ {
916
+ "epoch": 1.10363196125908,
917
+ "grad_norm": 1.3097947835922241,
918
+ "learning_rate": 2.1034233640831988e-05,
919
+ "loss": 0.0819,
920
+ "num_input_tokens_seen": 2840776,
921
+ "step": 1140
922
+ },
923
+ {
924
+ "epoch": 1.113317191283293,
925
+ "grad_norm": 0.5728388428688049,
926
+ "learning_rate": 2.0894575691450396e-05,
927
+ "loss": 0.0611,
928
+ "num_input_tokens_seen": 2865416,
929
+ "step": 1150
930
+ },
931
+ {
932
+ "epoch": 1.123002421307506,
933
+ "grad_norm": 2.3043558597564697,
934
+ "learning_rate": 2.0754310800806395e-05,
935
+ "loss": 0.0748,
936
+ "num_input_tokens_seen": 2890248,
937
+ "step": 1160
938
+ },
939
+ {
940
+ "epoch": 1.1326876513317192,
941
+ "grad_norm": 1.2087112665176392,
942
+ "learning_rate": 2.0613453411423797e-05,
943
+ "loss": 0.0959,
944
+ "num_input_tokens_seen": 2916392,
945
+ "step": 1170
946
+ },
947
+ {
948
+ "epoch": 1.1423728813559322,
949
+ "grad_norm": 1.5639240741729736,
950
+ "learning_rate": 2.0472018026833684e-05,
951
+ "loss": 0.0709,
952
+ "num_input_tokens_seen": 2941160,
953
+ "step": 1180
954
+ },
955
+ {
956
+ "epoch": 1.1520581113801454,
957
+ "grad_norm": 0.5889459848403931,
958
+ "learning_rate": 2.0330019210081022e-05,
959
+ "loss": 0.0731,
960
+ "num_input_tokens_seen": 2966120,
961
+ "step": 1190
962
+ },
963
+ {
964
+ "epoch": 1.1617433414043583,
965
+ "grad_norm": 1.854230523109436,
966
+ "learning_rate": 2.0187471582225173e-05,
967
+ "loss": 0.1005,
968
+ "num_input_tokens_seen": 2990088,
969
+ "step": 1200
970
+ },
971
+ {
972
+ "epoch": 1.1714285714285715,
973
+ "grad_norm": 2.01247239112854,
974
+ "learning_rate": 2.004438982083442e-05,
975
+ "loss": 0.0579,
976
+ "num_input_tokens_seen": 3015400,
977
+ "step": 1210
978
+ },
979
+ {
980
+ "epoch": 1.1811138014527844,
981
+ "grad_norm": 2.292900323867798,
982
+ "learning_rate": 1.9900788658474677e-05,
983
+ "loss": 0.0792,
984
+ "num_input_tokens_seen": 3039464,
985
+ "step": 1220
986
+ },
987
+ {
988
+ "epoch": 1.1907990314769976,
989
+ "grad_norm": 1.4194159507751465,
990
+ "learning_rate": 1.975668288119252e-05,
991
+ "loss": 0.057,
992
+ "num_input_tokens_seen": 3063816,
993
+ "step": 1230
994
+ },
995
+ {
996
+ "epoch": 1.2004842615012106,
997
+ "grad_norm": 1.0512489080429077,
998
+ "learning_rate": 1.961208732699275e-05,
999
+ "loss": 0.102,
1000
+ "num_input_tokens_seen": 3088968,
1001
+ "step": 1240
1002
+ },
1003
+ {
1004
+ "epoch": 1.2101694915254237,
1005
+ "grad_norm": 0.9465106129646301,
1006
+ "learning_rate": 1.9467016884310565e-05,
1007
+ "loss": 0.0691,
1008
+ "num_input_tokens_seen": 3113736,
1009
+ "step": 1250
1010
+ },
1011
+ {
1012
+ "epoch": 1.2198547215496367,
1013
+ "grad_norm": 1.274294376373291,
1014
+ "learning_rate": 1.9321486490478565e-05,
1015
+ "loss": 0.0668,
1016
+ "num_input_tokens_seen": 3138344,
1017
+ "step": 1260
1018
+ },
1019
+ {
1020
+ "epoch": 1.2295399515738499,
1021
+ "grad_norm": 1.9390579462051392,
1022
+ "learning_rate": 1.91755111301887e-05,
1023
+ "loss": 0.0711,
1024
+ "num_input_tokens_seen": 3163496,
1025
+ "step": 1270
1026
+ },
1027
+ {
1028
+ "epoch": 1.239225181598063,
1029
+ "grad_norm": 1.2855744361877441,
1030
+ "learning_rate": 1.902910583394938e-05,
1031
+ "loss": 0.0605,
1032
+ "num_input_tokens_seen": 3188392,
1033
+ "step": 1280
1034
+ },
1035
+ {
1036
+ "epoch": 1.248910411622276,
1037
+ "grad_norm": 2.931248188018799,
1038
+ "learning_rate": 1.888228567653781e-05,
1039
+ "loss": 0.0448,
1040
+ "num_input_tokens_seen": 3213224,
1041
+ "step": 1290
1042
+ },
1043
+ {
1044
+ "epoch": 1.2585956416464892,
1045
+ "grad_norm": 1.9991300106048584,
1046
+ "learning_rate": 1.873506577544784e-05,
1047
+ "loss": 0.0815,
1048
+ "num_input_tokens_seen": 3238568,
1049
+ "step": 1300
1050
+ },
1051
+ {
1052
+ "epoch": 1.2682808716707021,
1053
+ "grad_norm": 1.3530927896499634,
1054
+ "learning_rate": 1.8587461289333327e-05,
1055
+ "loss": 0.1043,
1056
+ "num_input_tokens_seen": 3264264,
1057
+ "step": 1310
1058
+ },
1059
+ {
1060
+ "epoch": 1.2779661016949153,
1061
+ "grad_norm": 2.07991099357605,
1062
+ "learning_rate": 1.8439487416447353e-05,
1063
+ "loss": 0.1037,
1064
+ "num_input_tokens_seen": 3288840,
1065
+ "step": 1320
1066
+ },
1067
+ {
1068
+ "epoch": 1.2876513317191283,
1069
+ "grad_norm": 1.8533947467803955,
1070
+ "learning_rate": 1.8291159393077294e-05,
1071
+ "loss": 0.0928,
1072
+ "num_input_tokens_seen": 3313832,
1073
+ "step": 1330
1074
+ },
1075
+ {
1076
+ "epoch": 1.2973365617433414,
1077
+ "grad_norm": 1.118119716644287,
1078
+ "learning_rate": 1.814249249197602e-05,
1079
+ "loss": 0.0775,
1080
+ "num_input_tokens_seen": 3337736,
1081
+ "step": 1340
1082
+ },
1083
+ {
1084
+ "epoch": 1.3070217917675544,
1085
+ "grad_norm": 2.740079641342163,
1086
+ "learning_rate": 1.7993502020789294e-05,
1087
+ "loss": 0.0521,
1088
+ "num_input_tokens_seen": 3362024,
1089
+ "step": 1350
1090
+ },
1091
+ {
1092
+ "epoch": 1.3167070217917676,
1093
+ "grad_norm": 1.9268351793289185,
1094
+ "learning_rate": 1.7844203320479614e-05,
1095
+ "loss": 0.0687,
1096
+ "num_input_tokens_seen": 3387496,
1097
+ "step": 1360
1098
+ },
1099
+ {
1100
+ "epoch": 1.3263922518159807,
1101
+ "grad_norm": 2.3576388359069824,
1102
+ "learning_rate": 1.7694611763746632e-05,
1103
+ "loss": 0.0704,
1104
+ "num_input_tokens_seen": 3412072,
1105
+ "step": 1370
1106
+ },
1107
+ {
1108
+ "epoch": 1.3360774818401937,
1109
+ "grad_norm": 1.127432942390442,
1110
+ "learning_rate": 1.754474275344427e-05,
1111
+ "loss": 0.0826,
1112
+ "num_input_tokens_seen": 3437096,
1113
+ "step": 1380
1114
+ },
1115
+ {
1116
+ "epoch": 1.3457627118644067,
1117
+ "grad_norm": 4.377537250518799,
1118
+ "learning_rate": 1.7394611720994747e-05,
1119
+ "loss": 0.0445,
1120
+ "num_input_tokens_seen": 3462120,
1121
+ "step": 1390
1122
+ },
1123
+ {
1124
+ "epoch": 1.3554479418886198,
1125
+ "grad_norm": 2.1285200119018555,
1126
+ "learning_rate": 1.724423412479967e-05,
1127
+ "loss": 0.0951,
1128
+ "num_input_tokens_seen": 3486952,
1129
+ "step": 1400
1130
+ },
1131
+ {
1132
+ "epoch": 1.365133171912833,
1133
+ "grad_norm": 0.16216270625591278,
1134
+ "learning_rate": 1.7093625448648348e-05,
1135
+ "loss": 0.0539,
1136
+ "num_input_tokens_seen": 3512264,
1137
+ "step": 1410
1138
+ },
1139
+ {
1140
+ "epoch": 1.374818401937046,
1141
+ "grad_norm": 2.1299915313720703,
1142
+ "learning_rate": 1.694280120012349e-05,
1143
+ "loss": 0.0848,
1144
+ "num_input_tokens_seen": 3537192,
1145
+ "step": 1420
1146
+ },
1147
+ {
1148
+ "epoch": 1.3845036319612591,
1149
+ "grad_norm": 2.476757049560547,
1150
+ "learning_rate": 1.6791776909004434e-05,
1151
+ "loss": 0.0629,
1152
+ "num_input_tokens_seen": 3560872,
1153
+ "step": 1430
1154
+ },
1155
+ {
1156
+ "epoch": 1.394188861985472,
1157
+ "grad_norm": 0.4373377561569214,
1158
+ "learning_rate": 1.664056812566812e-05,
1159
+ "loss": 0.079,
1160
+ "num_input_tokens_seen": 3586216,
1161
+ "step": 1440
1162
+ },
1163
+ {
1164
+ "epoch": 1.4038740920096853,
1165
+ "grad_norm": 1.9471170902252197,
1166
+ "learning_rate": 1.648919041948792e-05,
1167
+ "loss": 0.0798,
1168
+ "num_input_tokens_seen": 3610792,
1169
+ "step": 1450
1170
+ },
1171
+ {
1172
+ "epoch": 1.4135593220338982,
1173
+ "grad_norm": 2.911750316619873,
1174
+ "learning_rate": 1.6337659377230544e-05,
1175
+ "loss": 0.0897,
1176
+ "num_input_tokens_seen": 3634760,
1177
+ "step": 1460
1178
+ },
1179
+ {
1180
+ "epoch": 1.4232445520581114,
1181
+ "grad_norm": 2.9474802017211914,
1182
+ "learning_rate": 1.61859906014511e-05,
1183
+ "loss": 0.0858,
1184
+ "num_input_tokens_seen": 3659560,
1185
+ "step": 1470
1186
+ },
1187
+ {
1188
+ "epoch": 1.4329297820823244,
1189
+ "grad_norm": 0.6501768827438354,
1190
+ "learning_rate": 1.6034199708886573e-05,
1191
+ "loss": 0.0532,
1192
+ "num_input_tokens_seen": 3684840,
1193
+ "step": 1480
1194
+ },
1195
+ {
1196
+ "epoch": 1.4426150121065375,
1197
+ "grad_norm": 1.6708017587661743,
1198
+ "learning_rate": 1.5882302328847847e-05,
1199
+ "loss": 0.0842,
1200
+ "num_input_tokens_seen": 3709096,
1201
+ "step": 1490
1202
+ },
1203
+ {
1204
+ "epoch": 1.4523002421307507,
1205
+ "grad_norm": 1.5014967918395996,
1206
+ "learning_rate": 1.5730314101610376e-05,
1207
+ "loss": 0.0367,
1208
+ "num_input_tokens_seen": 3734728,
1209
+ "step": 1500
1210
+ },
1211
+ {
1212
+ "epoch": 1.4619854721549637,
1213
+ "grad_norm": 3.2587804794311523,
1214
+ "learning_rate": 1.5578250676803824e-05,
1215
+ "loss": 0.1085,
1216
+ "num_input_tokens_seen": 3758984,
1217
+ "step": 1510
1218
+ },
1219
+ {
1220
+ "epoch": 1.4716707021791768,
1221
+ "grad_norm": 6.304242134094238,
1222
+ "learning_rate": 1.5426127711800636e-05,
1223
+ "loss": 0.0712,
1224
+ "num_input_tokens_seen": 3784296,
1225
+ "step": 1520
1226
+ },
1227
+ {
1228
+ "epoch": 1.4813559322033898,
1229
+ "grad_norm": 1.1681016683578491,
1230
+ "learning_rate": 1.5273960870103872e-05,
1231
+ "loss": 0.0705,
1232
+ "num_input_tokens_seen": 3809768,
1233
+ "step": 1530
1234
+ },
1235
+ {
1236
+ "epoch": 1.491041162227603,
1237
+ "grad_norm": 1.111617922782898,
1238
+ "learning_rate": 1.5121765819734418e-05,
1239
+ "loss": 0.071,
1240
+ "num_input_tokens_seen": 3834536,
1241
+ "step": 1540
1242
+ },
1243
+ {
1244
+ "epoch": 1.5007263922518161,
1245
+ "grad_norm": 1.7780523300170898,
1246
+ "learning_rate": 1.4969558231617681e-05,
1247
+ "loss": 0.0648,
1248
+ "num_input_tokens_seen": 3858792,
1249
+ "step": 1550
1250
+ },
1251
+ {
1252
+ "epoch": 1.510411622276029,
1253
+ "grad_norm": 2.2017934322357178,
1254
+ "learning_rate": 1.4817353777970038e-05,
1255
+ "loss": 0.0633,
1256
+ "num_input_tokens_seen": 3883976,
1257
+ "step": 1560
1258
+ },
1259
+ {
1260
+ "epoch": 1.520096852300242,
1261
+ "grad_norm": 1.8567978143692017,
1262
+ "learning_rate": 1.466516813068512e-05,
1263
+ "loss": 0.0726,
1264
+ "num_input_tokens_seen": 3908392,
1265
+ "step": 1570
1266
+ },
1267
+ {
1268
+ "epoch": 1.5297820823244552,
1269
+ "grad_norm": 2.567291021347046,
1270
+ "learning_rate": 1.451301695972015e-05,
1271
+ "loss": 0.0882,
1272
+ "num_input_tokens_seen": 3932552,
1273
+ "step": 1580
1274
+ },
1275
+ {
1276
+ "epoch": 1.5394673123486684,
1277
+ "grad_norm": 1.9968935251235962,
1278
+ "learning_rate": 1.436091593148244e-05,
1279
+ "loss": 0.1149,
1280
+ "num_input_tokens_seen": 3957672,
1281
+ "step": 1590
1282
+ },
1283
+ {
1284
+ "epoch": 1.5491525423728814,
1285
+ "grad_norm": 1.9058917760849,
1286
+ "learning_rate": 1.4208880707216323e-05,
1287
+ "loss": 0.0841,
1288
+ "num_input_tokens_seen": 3982824,
1289
+ "step": 1600
1290
+ },
1291
+ {
1292
+ "epoch": 1.5588377723970943,
1293
+ "grad_norm": 1.9218000173568726,
1294
+ "learning_rate": 1.405692694139054e-05,
1295
+ "loss": 0.0896,
1296
+ "num_input_tokens_seen": 4008072,
1297
+ "step": 1610
1298
+ },
1299
+ {
1300
+ "epoch": 1.5685230024213075,
1301
+ "grad_norm": 1.5786553621292114,
1302
+ "learning_rate": 1.3905070280086387e-05,
1303
+ "loss": 0.0629,
1304
+ "num_input_tokens_seen": 4033096,
1305
+ "step": 1620
1306
+ },
1307
+ {
1308
+ "epoch": 1.5782082324455207,
1309
+ "grad_norm": 2.503990888595581,
1310
+ "learning_rate": 1.3753326359386695e-05,
1311
+ "loss": 0.077,
1312
+ "num_input_tokens_seen": 4058120,
1313
+ "step": 1630
1314
+ },
1315
+ {
1316
+ "epoch": 1.5878934624697336,
1317
+ "grad_norm": 1.5616143941879272,
1318
+ "learning_rate": 1.3601710803765814e-05,
1319
+ "loss": 0.0853,
1320
+ "num_input_tokens_seen": 4082792,
1321
+ "step": 1640
1322
+ },
1323
+ {
1324
+ "epoch": 1.5975786924939466,
1325
+ "grad_norm": 1.2533211708068848,
1326
+ "learning_rate": 1.3450239224480884e-05,
1327
+ "loss": 0.0605,
1328
+ "num_input_tokens_seen": 4107336,
1329
+ "step": 1650
1330
+ },
1331
+ {
1332
+ "epoch": 1.6072639225181597,
1333
+ "grad_norm": 1.1046490669250488,
1334
+ "learning_rate": 1.329892721796433e-05,
1335
+ "loss": 0.0985,
1336
+ "num_input_tokens_seen": 4132456,
1337
+ "step": 1660
1338
+ },
1339
+ {
1340
+ "epoch": 1.616949152542373,
1341
+ "grad_norm": 1.143494725227356,
1342
+ "learning_rate": 1.314779036421802e-05,
1343
+ "loss": 0.0547,
1344
+ "num_input_tokens_seen": 4156584,
1345
+ "step": 1670
1346
+ },
1347
+ {
1348
+ "epoch": 1.626634382566586,
1349
+ "grad_norm": 2.6082706451416016,
1350
+ "learning_rate": 1.2996844225209033e-05,
1351
+ "loss": 0.0919,
1352
+ "num_input_tokens_seen": 4181448,
1353
+ "step": 1680
1354
+ },
1355
+ {
1356
+ "epoch": 1.636319612590799,
1357
+ "grad_norm": 2.4191458225250244,
1358
+ "learning_rate": 1.2846104343267283e-05,
1359
+ "loss": 0.1204,
1360
+ "num_input_tokens_seen": 4207560,
1361
+ "step": 1690
1362
+ },
1363
+ {
1364
+ "epoch": 1.646004842615012,
1365
+ "grad_norm": 2.051799774169922,
1366
+ "learning_rate": 1.2695586239485223e-05,
1367
+ "loss": 0.0664,
1368
+ "num_input_tokens_seen": 4232040,
1369
+ "step": 1700
1370
+ },
1371
+ {
1372
+ "epoch": 1.6556900726392252,
1373
+ "grad_norm": 1.525844931602478,
1374
+ "learning_rate": 1.254530541211968e-05,
1375
+ "loss": 0.0805,
1376
+ "num_input_tokens_seen": 4257576,
1377
+ "step": 1710
1378
+ },
1379
+ {
1380
+ "epoch": 1.6653753026634384,
1381
+ "grad_norm": 0.9474373459815979,
1382
+ "learning_rate": 1.2395277334996045e-05,
1383
+ "loss": 0.073,
1384
+ "num_input_tokens_seen": 4282472,
1385
+ "step": 1720
1386
+ },
1387
+ {
1388
+ "epoch": 1.6750605326876513,
1389
+ "grad_norm": 1.8932424783706665,
1390
+ "learning_rate": 1.2245517455915036e-05,
1391
+ "loss": 0.0734,
1392
+ "num_input_tokens_seen": 4306792,
1393
+ "step": 1730
1394
+ },
1395
+ {
1396
+ "epoch": 1.6847457627118643,
1397
+ "grad_norm": 1.9888746738433838,
1398
+ "learning_rate": 1.2096041195062051e-05,
1399
+ "loss": 0.0831,
1400
+ "num_input_tokens_seen": 4333384,
1401
+ "step": 1740
1402
+ },
1403
+ {
1404
+ "epoch": 1.6944309927360774,
1405
+ "grad_norm": 1.8355742692947388,
1406
+ "learning_rate": 1.1946863943419452e-05,
1407
+ "loss": 0.0691,
1408
+ "num_input_tokens_seen": 4358344,
1409
+ "step": 1750
1410
+ },
1411
+ {
1412
+ "epoch": 1.7041162227602906,
1413
+ "grad_norm": 2.8447251319885254,
1414
+ "learning_rate": 1.1798001061181799e-05,
1415
+ "loss": 0.0988,
1416
+ "num_input_tokens_seen": 4381768,
1417
+ "step": 1760
1418
+ },
1419
+ {
1420
+ "epoch": 1.7138014527845038,
1421
+ "grad_norm": 2.670257806777954,
1422
+ "learning_rate": 1.1649467876174252e-05,
1423
+ "loss": 0.0936,
1424
+ "num_input_tokens_seen": 4405192,
1425
+ "step": 1770
1426
+ },
1427
+ {
1428
+ "epoch": 1.7234866828087168,
1429
+ "grad_norm": 1.188839077949524,
1430
+ "learning_rate": 1.1501279682274368e-05,
1431
+ "loss": 0.0901,
1432
+ "num_input_tokens_seen": 4430344,
1433
+ "step": 1780
1434
+ },
1435
+ {
1436
+ "epoch": 1.7331719128329297,
1437
+ "grad_norm": 2.494746685028076,
1438
+ "learning_rate": 1.1353451737837312e-05,
1439
+ "loss": 0.0691,
1440
+ "num_input_tokens_seen": 4455336,
1441
+ "step": 1790
1442
+ },
1443
+ {
1444
+ "epoch": 1.7428571428571429,
1445
+ "grad_norm": 1.3223942518234253,
1446
+ "learning_rate": 1.1205999264124788e-05,
1447
+ "loss": 0.0668,
1448
+ "num_input_tokens_seen": 4480648,
1449
+ "step": 1800
1450
+ },
1451
+ {
1452
+ "epoch": 1.752542372881356,
1453
+ "grad_norm": 1.3812003135681152,
1454
+ "learning_rate": 1.105893744373776e-05,
1455
+ "loss": 0.0788,
1456
+ "num_input_tokens_seen": 4506600,
1457
+ "step": 1810
1458
+ },
1459
+ {
1460
+ "epoch": 1.762227602905569,
1461
+ "grad_norm": 0.7805346250534058,
1462
+ "learning_rate": 1.0912281419053139e-05,
1463
+ "loss": 0.0723,
1464
+ "num_input_tokens_seen": 4531368,
1465
+ "step": 1820
1466
+ },
1467
+ {
1468
+ "epoch": 1.771912832929782,
1469
+ "grad_norm": 1.105878472328186,
1470
+ "learning_rate": 1.0766046290664662e-05,
1471
+ "loss": 0.0779,
1472
+ "num_input_tokens_seen": 4555272,
1473
+ "step": 1830
1474
+ },
1475
+ {
1476
+ "epoch": 1.7815980629539951,
1477
+ "grad_norm": 1.8672295808792114,
1478
+ "learning_rate": 1.0620247115828044e-05,
1479
+ "loss": 0.0838,
1480
+ "num_input_tokens_seen": 4580328,
1481
+ "step": 1840
1482
+ },
1483
+ {
1484
+ "epoch": 1.7912832929782083,
1485
+ "grad_norm": 1.844306468963623,
1486
+ "learning_rate": 1.047489890691055e-05,
1487
+ "loss": 0.0594,
1488
+ "num_input_tokens_seen": 4605768,
1489
+ "step": 1850
1490
+ },
1491
+ {
1492
+ "epoch": 1.8009685230024213,
1493
+ "grad_norm": 1.2717005014419556,
1494
+ "learning_rate": 1.0330016629845276e-05,
1495
+ "loss": 0.04,
1496
+ "num_input_tokens_seen": 4631048,
1497
+ "step": 1860
1498
+ },
1499
+ {
1500
+ "epoch": 1.8106537530266342,
1501
+ "grad_norm": 3.5843582153320312,
1502
+ "learning_rate": 1.0185615202590144e-05,
1503
+ "loss": 0.084,
1504
+ "num_input_tokens_seen": 4656456,
1505
+ "step": 1870
1506
+ },
1507
+ {
1508
+ "epoch": 1.8203389830508474,
1509
+ "grad_norm": 4.254288673400879,
1510
+ "learning_rate": 1.004170949359187e-05,
1511
+ "loss": 0.0654,
1512
+ "num_input_tokens_seen": 4681384,
1513
+ "step": 1880
1514
+ },
1515
+ {
1516
+ "epoch": 1.8300242130750606,
1517
+ "grad_norm": 1.351646065711975,
1518
+ "learning_rate": 9.89831432025501e-06,
1519
+ "loss": 0.0712,
1520
+ "num_input_tokens_seen": 4706216,
1521
+ "step": 1890
1522
+ },
1523
+ {
1524
+ "epoch": 1.8397094430992738,
1525
+ "grad_norm": 1.9015384912490845,
1526
+ "learning_rate": 9.755444447416255e-06,
1527
+ "loss": 0.0829,
1528
+ "num_input_tokens_seen": 4730984,
1529
+ "step": 1900
1530
+ },
1531
+ {
1532
+ "epoch": 1.8493946731234867,
1533
+ "grad_norm": 1.3803085088729858,
1534
+ "learning_rate": 9.613114585824196e-06,
1535
+ "loss": 0.0532,
1536
+ "num_input_tokens_seen": 4755112,
1537
+ "step": 1910
1538
+ },
1539
+ {
1540
+ "epoch": 1.8590799031476997,
1541
+ "grad_norm": 6.487275123596191,
1542
+ "learning_rate": 9.471339390624574e-06,
1543
+ "loss": 0.0781,
1544
+ "num_input_tokens_seen": 4780232,
1545
+ "step": 1920
1546
+ },
1547
+ {
1548
+ "epoch": 1.8687651331719128,
1549
+ "grad_norm": 2.182865619659424,
1550
+ "learning_rate": 9.330133459851323e-06,
1551
+ "loss": 0.0908,
1552
+ "num_input_tokens_seen": 4805192,
1553
+ "step": 1930
1554
+ },
1555
+ {
1556
+ "epoch": 1.878450363196126,
1557
+ "grad_norm": 0.42010384798049927,
1558
+ "learning_rate": 9.189511332923463e-06,
1559
+ "loss": 0.0398,
1560
+ "num_input_tokens_seen": 4830856,
1561
+ "step": 1940
1562
+ },
1563
+ {
1564
+ "epoch": 1.888135593220339,
1565
+ "grad_norm": 1.609157919883728,
1566
+ "learning_rate": 9.049487489148008e-06,
1567
+ "loss": 0.0912,
1568
+ "num_input_tokens_seen": 4855656,
1569
+ "step": 1950
1570
+ },
1571
+ {
1572
+ "epoch": 1.897820823244552,
1573
+ "grad_norm": 2.4291250705718994,
1574
+ "learning_rate": 8.910076346229134e-06,
1575
+ "loss": 0.0746,
1576
+ "num_input_tokens_seen": 4880392,
1577
+ "step": 1960
1578
+ },
1579
+ {
1580
+ "epoch": 1.907506053268765,
1581
+ "grad_norm": 2.243717670440674,
1582
+ "learning_rate": 8.77129225878361e-06,
1583
+ "loss": 0.1066,
1584
+ "num_input_tokens_seen": 4905320,
1585
+ "step": 1970
1586
+ },
1587
+ {
1588
+ "epoch": 1.9171912832929783,
1589
+ "grad_norm": 2.145559072494507,
1590
+ "learning_rate": 8.633149516862777e-06,
1591
+ "loss": 0.0839,
1592
+ "num_input_tokens_seen": 4930536,
1593
+ "step": 1980
1594
+ },
1595
+ {
1596
+ "epoch": 1.9268765133171912,
1597
+ "grad_norm": 0.6746326088905334,
1598
+ "learning_rate": 8.495662344481135e-06,
1599
+ "loss": 0.0527,
1600
+ "num_input_tokens_seen": 4956168,
1601
+ "step": 1990
1602
+ },
1603
+ {
1604
+ "epoch": 1.9365617433414044,
1605
+ "grad_norm": 1.293521761894226,
1606
+ "learning_rate": 8.358844898151791e-06,
1607
+ "loss": 0.1033,
1608
+ "num_input_tokens_seen": 4980584,
1609
+ "step": 2000
1610
+ }
1611
+ ],
1612
+ "logging_steps": 10,
1613
+ "max_steps": 3096,
1614
+ "num_input_tokens_seen": 4980584,
1615
+ "num_train_epochs": 3,
1616
+ "save_steps": 1000,
1617
+ "stateful_callbacks": {
1618
+ "TrainerControl": {
1619
+ "args": {
1620
+ "should_epoch_stop": false,
1621
+ "should_evaluate": false,
1622
+ "should_log": false,
1623
+ "should_save": true,
1624
+ "should_training_stop": false
1625
+ },
1626
+ "attributes": {}
1627
+ }
1628
+ },
1629
+ "total_flos": 2.1321214729853338e+17,
1630
+ "train_batch_size": 4,
1631
+ "trial_name": null,
1632
+ "trial_params": null
1633
+ }
checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de7a6d9a1a05e78971782a6ee6bb88dfb9617ab9f9e2f35984cd80b0711875f6
3
+ size 5688
checkpoint-3000/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
checkpoint-3000/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "up_proj",
24
+ "gate_proj",
25
+ "q_proj",
26
+ "down_proj",
27
+ "v_proj",
28
+ "k_proj",
29
+ "o_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
checkpoint-3000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88f40422209b44dcf6949244143f9c3a5028460571a63a7ff281dd5bf6a3becc
3
+ size 83945296
checkpoint-3000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0c72806bedfcea859d7f5efbb8f3bb2d6aca36f701f9c922e30b5589a71b6dc
3
+ size 168149074
checkpoint-3000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9196a1e708bf24d6abba41cce3f8558820acc3e50f9394c5955e29eb41ffea3d
3
+ size 14244
checkpoint-3000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:293e3fd50e74dc9b279e0b0769e2c511fe2d3e965dac5dda41bc468401725653
3
+ size 1064
checkpoint-3000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-3000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
3
+ size 587404
checkpoint-3000/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3000/trainer_state.json ADDED
@@ -0,0 +1,2433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9046004842615014,
5
+ "eval_steps": 500,
6
+ "global_step": 3000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009685230024213076,
13
+ "grad_norm": 6.778852939605713,
14
+ "learning_rate": 2.9999227754514262e-05,
15
+ "loss": 0.8519,
16
+ "num_input_tokens_seen": 25568,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.01937046004842615,
21
+ "grad_norm": 3.0029561519622803,
22
+ "learning_rate": 2.9996911097572118e-05,
23
+ "loss": 0.189,
24
+ "num_input_tokens_seen": 51072,
25
+ "step": 20
26
+ },
27
+ {
28
+ "epoch": 0.029055690072639227,
29
+ "grad_norm": 5.477710247039795,
30
+ "learning_rate": 2.9993050267710624e-05,
31
+ "loss": 0.1648,
32
+ "num_input_tokens_seen": 76416,
33
+ "step": 30
34
+ },
35
+ {
36
+ "epoch": 0.0387409200968523,
37
+ "grad_norm": 4.35634183883667,
38
+ "learning_rate": 2.9987645662464235e-05,
39
+ "loss": 0.1905,
40
+ "num_input_tokens_seen": 101344,
41
+ "step": 40
42
+ },
43
+ {
44
+ "epoch": 0.048426150121065374,
45
+ "grad_norm": 4.523565292358398,
46
+ "learning_rate": 2.9980697838323884e-05,
47
+ "loss": 0.1794,
48
+ "num_input_tokens_seen": 126656,
49
+ "step": 50
50
+ },
51
+ {
52
+ "epoch": 0.05811138014527845,
53
+ "grad_norm": 1.9348187446594238,
54
+ "learning_rate": 2.9972207510679677e-05,
55
+ "loss": 0.1528,
56
+ "num_input_tokens_seen": 151200,
57
+ "step": 60
58
+ },
59
+ {
60
+ "epoch": 0.06779661016949153,
61
+ "grad_norm": 2.981433629989624,
62
+ "learning_rate": 2.996217555374725e-05,
63
+ "loss": 0.1742,
64
+ "num_input_tokens_seen": 175968,
65
+ "step": 70
66
+ },
67
+ {
68
+ "epoch": 0.0774818401937046,
69
+ "grad_norm": 3.6294591426849365,
70
+ "learning_rate": 2.9950603000477722e-05,
71
+ "loss": 0.1565,
72
+ "num_input_tokens_seen": 201280,
73
+ "step": 80
74
+ },
75
+ {
76
+ "epoch": 0.08716707021791767,
77
+ "grad_norm": 2.5459301471710205,
78
+ "learning_rate": 2.993749104245137e-05,
79
+ "loss": 0.1499,
80
+ "num_input_tokens_seen": 226432,
81
+ "step": 90
82
+ },
83
+ {
84
+ "epoch": 0.09685230024213075,
85
+ "grad_norm": 2.2721059322357178,
86
+ "learning_rate": 2.992284102975491e-05,
87
+ "loss": 0.1441,
88
+ "num_input_tokens_seen": 251744,
89
+ "step": 100
90
+ },
91
+ {
92
+ "epoch": 0.10653753026634383,
93
+ "grad_norm": 2.0033624172210693,
94
+ "learning_rate": 2.9906654470842492e-05,
95
+ "loss": 0.1245,
96
+ "num_input_tokens_seen": 276480,
97
+ "step": 110
98
+ },
99
+ {
100
+ "epoch": 0.1162227602905569,
101
+ "grad_norm": 8.585118293762207,
102
+ "learning_rate": 2.9888933032380397e-05,
103
+ "loss": 0.1333,
104
+ "num_input_tokens_seen": 301664,
105
+ "step": 120
106
+ },
107
+ {
108
+ "epoch": 0.12590799031476999,
109
+ "grad_norm": 1.423967719078064,
110
+ "learning_rate": 2.9869678539075403e-05,
111
+ "loss": 0.1728,
112
+ "num_input_tokens_seen": 326784,
113
+ "step": 130
114
+ },
115
+ {
116
+ "epoch": 0.13559322033898305,
117
+ "grad_norm": 2.6306211948394775,
118
+ "learning_rate": 2.9848892973486912e-05,
119
+ "loss": 0.1281,
120
+ "num_input_tokens_seen": 351328,
121
+ "step": 140
122
+ },
123
+ {
124
+ "epoch": 0.14527845036319612,
125
+ "grad_norm": 2.5618090629577637,
126
+ "learning_rate": 2.9826578475822825e-05,
127
+ "loss": 0.1136,
128
+ "num_input_tokens_seen": 376000,
129
+ "step": 150
130
+ },
131
+ {
132
+ "epoch": 0.1549636803874092,
133
+ "grad_norm": 2.694077730178833,
134
+ "learning_rate": 2.980273734371914e-05,
135
+ "loss": 0.1277,
136
+ "num_input_tokens_seen": 400384,
137
+ "step": 160
138
+ },
139
+ {
140
+ "epoch": 0.16464891041162227,
141
+ "grad_norm": 2.632338047027588,
142
+ "learning_rate": 2.9777372032003423e-05,
143
+ "loss": 0.1028,
144
+ "num_input_tokens_seen": 426432,
145
+ "step": 170
146
+ },
147
+ {
148
+ "epoch": 0.17433414043583534,
149
+ "grad_norm": 2.3446829319000244,
150
+ "learning_rate": 2.975048515244199e-05,
151
+ "loss": 0.1245,
152
+ "num_input_tokens_seen": 451712,
153
+ "step": 180
154
+ },
155
+ {
156
+ "epoch": 0.18401937046004843,
157
+ "grad_norm": 1.8457319736480713,
158
+ "learning_rate": 2.9722079473471035e-05,
159
+ "loss": 0.142,
160
+ "num_input_tokens_seen": 476960,
161
+ "step": 190
162
+ },
163
+ {
164
+ "epoch": 0.1937046004842615,
165
+ "grad_norm": 1.8676010370254517,
166
+ "learning_rate": 2.9692157919911536e-05,
167
+ "loss": 0.1342,
168
+ "num_input_tokens_seen": 501440,
169
+ "step": 200
170
+ },
171
+ {
172
+ "epoch": 0.2033898305084746,
173
+ "grad_norm": 4.593673229217529,
174
+ "learning_rate": 2.966072357266811e-05,
175
+ "loss": 0.1314,
176
+ "num_input_tokens_seen": 526656,
177
+ "step": 210
178
+ },
179
+ {
180
+ "epoch": 0.21307506053268765,
181
+ "grad_norm": 3.9568676948547363,
182
+ "learning_rate": 2.9627779668411795e-05,
183
+ "loss": 0.171,
184
+ "num_input_tokens_seen": 552544,
185
+ "step": 220
186
+ },
187
+ {
188
+ "epoch": 0.22276029055690072,
189
+ "grad_norm": 2.4331846237182617,
190
+ "learning_rate": 2.9593329599246766e-05,
191
+ "loss": 0.115,
192
+ "num_input_tokens_seen": 577472,
193
+ "step": 230
194
+ },
195
+ {
196
+ "epoch": 0.2324455205811138,
197
+ "grad_norm": 2.525543212890625,
198
+ "learning_rate": 2.955737691236108e-05,
199
+ "loss": 0.1158,
200
+ "num_input_tokens_seen": 601856,
201
+ "step": 240
202
+ },
203
+ {
204
+ "epoch": 0.24213075060532688,
205
+ "grad_norm": 2.2355105876922607,
206
+ "learning_rate": 2.9519925309661422e-05,
207
+ "loss": 0.111,
208
+ "num_input_tokens_seen": 627904,
209
+ "step": 250
210
+ },
211
+ {
212
+ "epoch": 0.25181598062953997,
213
+ "grad_norm": 4.165389537811279,
214
+ "learning_rate": 2.948097864739194e-05,
215
+ "loss": 0.1314,
216
+ "num_input_tokens_seen": 651936,
217
+ "step": 260
218
+ },
219
+ {
220
+ "epoch": 0.26150121065375304,
221
+ "grad_norm": 3.1712851524353027,
222
+ "learning_rate": 2.944054093573719e-05,
223
+ "loss": 0.143,
224
+ "num_input_tokens_seen": 676416,
225
+ "step": 270
226
+ },
227
+ {
228
+ "epoch": 0.2711864406779661,
229
+ "grad_norm": 2.881716728210449,
230
+ "learning_rate": 2.93986163384092e-05,
231
+ "loss": 0.1121,
232
+ "num_input_tokens_seen": 700832,
233
+ "step": 280
234
+ },
235
+ {
236
+ "epoch": 0.28087167070217917,
237
+ "grad_norm": 3.060872793197632,
238
+ "learning_rate": 2.9355209172218777e-05,
239
+ "loss": 0.1159,
240
+ "num_input_tokens_seen": 725824,
241
+ "step": 290
242
+ },
243
+ {
244
+ "epoch": 0.29055690072639223,
245
+ "grad_norm": 4.449444770812988,
246
+ "learning_rate": 2.931032390663101e-05,
247
+ "loss": 0.133,
248
+ "num_input_tokens_seen": 749408,
249
+ "step": 300
250
+ },
251
+ {
252
+ "epoch": 0.30024213075060535,
253
+ "grad_norm": 5.323568344116211,
254
+ "learning_rate": 2.926396516330506e-05,
255
+ "loss": 0.1172,
256
+ "num_input_tokens_seen": 773984,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 0.3099273607748184,
261
+ "grad_norm": 3.144500732421875,
262
+ "learning_rate": 2.921613771561829e-05,
263
+ "loss": 0.136,
264
+ "num_input_tokens_seen": 799168,
265
+ "step": 320
266
+ },
267
+ {
268
+ "epoch": 0.3196125907990315,
269
+ "grad_norm": 2.433586359024048,
270
+ "learning_rate": 2.916684648817478e-05,
271
+ "loss": 0.0973,
272
+ "num_input_tokens_seen": 824320,
273
+ "step": 330
274
+ },
275
+ {
276
+ "epoch": 0.32929782082324455,
277
+ "grad_norm": 3.349472761154175,
278
+ "learning_rate": 2.9116096556298256e-05,
279
+ "loss": 0.13,
280
+ "num_input_tokens_seen": 849632,
281
+ "step": 340
282
+ },
283
+ {
284
+ "epoch": 0.3389830508474576,
285
+ "grad_norm": 1.8927061557769775,
286
+ "learning_rate": 2.9063893145509475e-05,
287
+ "loss": 0.1257,
288
+ "num_input_tokens_seen": 874400,
289
+ "step": 350
290
+ },
291
+ {
292
+ "epoch": 0.3486682808716707,
293
+ "grad_norm": 3.972686529159546,
294
+ "learning_rate": 2.901024163098822e-05,
295
+ "loss": 0.1155,
296
+ "num_input_tokens_seen": 899264,
297
+ "step": 360
298
+ },
299
+ {
300
+ "epoch": 0.3583535108958838,
301
+ "grad_norm": 1.177282452583313,
302
+ "learning_rate": 2.8955147537019815e-05,
303
+ "loss": 0.1251,
304
+ "num_input_tokens_seen": 924544,
305
+ "step": 370
306
+ },
307
+ {
308
+ "epoch": 0.36803874092009686,
309
+ "grad_norm": 1.9911576509475708,
310
+ "learning_rate": 2.88986165364263e-05,
311
+ "loss": 0.1147,
312
+ "num_input_tokens_seen": 949792,
313
+ "step": 380
314
+ },
315
+ {
316
+ "epoch": 0.37772397094430993,
317
+ "grad_norm": 2.402615785598755,
318
+ "learning_rate": 2.8840654449982344e-05,
319
+ "loss": 0.1433,
320
+ "num_input_tokens_seen": 974112,
321
+ "step": 390
322
+ },
323
+ {
324
+ "epoch": 0.387409200968523,
325
+ "grad_norm": 1.3184998035430908,
326
+ "learning_rate": 2.8781267245815898e-05,
327
+ "loss": 0.1117,
328
+ "num_input_tokens_seen": 999168,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 0.39709443099273606,
333
+ "grad_norm": 1.9284625053405762,
334
+ "learning_rate": 2.8720461038793672e-05,
335
+ "loss": 0.1353,
336
+ "num_input_tokens_seen": 1024320,
337
+ "step": 410
338
+ },
339
+ {
340
+ "epoch": 0.4067796610169492,
341
+ "grad_norm": 3.1020259857177734,
342
+ "learning_rate": 2.8658242089891515e-05,
343
+ "loss": 0.1165,
344
+ "num_input_tokens_seen": 1049088,
345
+ "step": 420
346
+ },
347
+ {
348
+ "epoch": 0.41646489104116224,
349
+ "grad_norm": 2.203179359436035,
350
+ "learning_rate": 2.8594616805549752e-05,
351
+ "loss": 0.1215,
352
+ "num_input_tokens_seen": 1073632,
353
+ "step": 430
354
+ },
355
+ {
356
+ "epoch": 0.4261501210653753,
357
+ "grad_norm": 2.053194522857666,
358
+ "learning_rate": 2.8529591737013526e-05,
359
+ "loss": 0.1066,
360
+ "num_input_tokens_seen": 1098208,
361
+ "step": 440
362
+ },
363
+ {
364
+ "epoch": 0.4358353510895884,
365
+ "grad_norm": 2.780935049057007,
366
+ "learning_rate": 2.8463173579658258e-05,
367
+ "loss": 0.0879,
368
+ "num_input_tokens_seen": 1122336,
369
+ "step": 450
370
+ },
371
+ {
372
+ "epoch": 0.44552058111380144,
373
+ "grad_norm": 1.9929611682891846,
374
+ "learning_rate": 2.8395369172300235e-05,
375
+ "loss": 0.1141,
376
+ "num_input_tokens_seen": 1147392,
377
+ "step": 460
378
+ },
379
+ {
380
+ "epoch": 0.4552058111380145,
381
+ "grad_norm": 1.1469779014587402,
382
+ "learning_rate": 2.8326185496492464e-05,
383
+ "loss": 0.1052,
384
+ "num_input_tokens_seen": 1173248,
385
+ "step": 470
386
+ },
387
+ {
388
+ "epoch": 0.4648910411622276,
389
+ "grad_norm": 2.501117706298828,
390
+ "learning_rate": 2.825562967580579e-05,
391
+ "loss": 0.1086,
392
+ "num_input_tokens_seen": 1197984,
393
+ "step": 480
394
+ },
395
+ {
396
+ "epoch": 0.4745762711864407,
397
+ "grad_norm": 2.0266308784484863,
398
+ "learning_rate": 2.8183708975095406e-05,
399
+ "loss": 0.1201,
400
+ "num_input_tokens_seen": 1222720,
401
+ "step": 490
402
+ },
403
+ {
404
+ "epoch": 0.48426150121065376,
405
+ "grad_norm": 1.1120251417160034,
406
+ "learning_rate": 2.8110430799752845e-05,
407
+ "loss": 0.1319,
408
+ "num_input_tokens_seen": 1247232,
409
+ "step": 500
410
+ },
411
+ {
412
+ "epoch": 0.4939467312348668,
413
+ "grad_norm": 1.2014496326446533,
414
+ "learning_rate": 2.8035802694943457e-05,
415
+ "loss": 0.1071,
416
+ "num_input_tokens_seen": 1273184,
417
+ "step": 510
418
+ },
419
+ {
420
+ "epoch": 0.5036319612590799,
421
+ "grad_norm": 1.1245245933532715,
422
+ "learning_rate": 2.7959832344829512e-05,
423
+ "loss": 0.1554,
424
+ "num_input_tokens_seen": 1298688,
425
+ "step": 520
426
+ },
427
+ {
428
+ "epoch": 0.513317191283293,
429
+ "grad_norm": 2.031115770339966,
430
+ "learning_rate": 2.7882527571779003e-05,
431
+ "loss": 0.1196,
432
+ "num_input_tokens_seen": 1324128,
433
+ "step": 530
434
+ },
435
+ {
436
+ "epoch": 0.5230024213075061,
437
+ "grad_norm": 1.7691289186477661,
438
+ "learning_rate": 2.78038963355602e-05,
439
+ "loss": 0.1334,
440
+ "num_input_tokens_seen": 1349120,
441
+ "step": 540
442
+ },
443
+ {
444
+ "epoch": 0.5326876513317191,
445
+ "grad_norm": 2.9496989250183105,
446
+ "learning_rate": 2.7723946732522055e-05,
447
+ "loss": 0.1109,
448
+ "num_input_tokens_seen": 1374304,
449
+ "step": 550
450
+ },
451
+ {
452
+ "epoch": 0.5423728813559322,
453
+ "grad_norm": 2.2881715297698975,
454
+ "learning_rate": 2.764268699476058e-05,
455
+ "loss": 0.1274,
456
+ "num_input_tokens_seen": 1399136,
457
+ "step": 560
458
+ },
459
+ {
460
+ "epoch": 0.5520581113801453,
461
+ "grad_norm": 1.9754095077514648,
462
+ "learning_rate": 2.756012548927119e-05,
463
+ "loss": 0.1397,
464
+ "num_input_tokens_seen": 1424672,
465
+ "step": 570
466
+ },
467
+ {
468
+ "epoch": 0.5617433414043583,
469
+ "grad_norm": 1.9883428812026978,
470
+ "learning_rate": 2.7476270717087215e-05,
471
+ "loss": 0.101,
472
+ "num_input_tokens_seen": 1449024,
473
+ "step": 580
474
+ },
475
+ {
476
+ "epoch": 0.5714285714285714,
477
+ "grad_norm": 0.9653130769729614,
478
+ "learning_rate": 2.7391131312404556e-05,
479
+ "loss": 0.0941,
480
+ "num_input_tokens_seen": 1475264,
481
+ "step": 590
482
+ },
483
+ {
484
+ "epoch": 0.5811138014527845,
485
+ "grad_norm": 4.576601028442383,
486
+ "learning_rate": 2.7304716041692663e-05,
487
+ "loss": 0.0865,
488
+ "num_input_tokens_seen": 1500064,
489
+ "step": 600
490
+ },
491
+ {
492
+ "epoch": 0.5907990314769975,
493
+ "grad_norm": 2.4046311378479004,
494
+ "learning_rate": 2.7217033802791906e-05,
495
+ "loss": 0.1596,
496
+ "num_input_tokens_seen": 1524448,
497
+ "step": 610
498
+ },
499
+ {
500
+ "epoch": 0.6004842615012107,
501
+ "grad_norm": 1.7785555124282837,
502
+ "learning_rate": 2.7128093623997368e-05,
503
+ "loss": 0.0891,
504
+ "num_input_tokens_seen": 1549536,
505
+ "step": 620
506
+ },
507
+ {
508
+ "epoch": 0.6101694915254238,
509
+ "grad_norm": 2.2736170291900635,
510
+ "learning_rate": 2.7037904663129262e-05,
511
+ "loss": 0.1085,
512
+ "num_input_tokens_seen": 1573408,
513
+ "step": 630
514
+ },
515
+ {
516
+ "epoch": 0.6198547215496368,
517
+ "grad_norm": 1.0862345695495605,
518
+ "learning_rate": 2.6946476206589972e-05,
519
+ "loss": 0.1023,
520
+ "num_input_tokens_seen": 1597888,
521
+ "step": 640
522
+ },
523
+ {
524
+ "epoch": 0.6295399515738499,
525
+ "grad_norm": 0.5358290672302246,
526
+ "learning_rate": 2.6853817668407875e-05,
527
+ "loss": 0.0669,
528
+ "num_input_tokens_seen": 1623296,
529
+ "step": 650
530
+ },
531
+ {
532
+ "epoch": 0.639225181598063,
533
+ "grad_norm": 2.3138749599456787,
534
+ "learning_rate": 2.6759938589268023e-05,
535
+ "loss": 0.1017,
536
+ "num_input_tokens_seen": 1649216,
537
+ "step": 660
538
+ },
539
+ {
540
+ "epoch": 0.648910411622276,
541
+ "grad_norm": 3.2054226398468018,
542
+ "learning_rate": 2.6664848635529742e-05,
543
+ "loss": 0.1432,
544
+ "num_input_tokens_seen": 1673760,
545
+ "step": 670
546
+ },
547
+ {
548
+ "epoch": 0.6585956416464891,
549
+ "grad_norm": 1.8352829217910767,
550
+ "learning_rate": 2.6568557598231385e-05,
551
+ "loss": 0.1081,
552
+ "num_input_tokens_seen": 1698592,
553
+ "step": 680
554
+ },
555
+ {
556
+ "epoch": 0.6682808716707022,
557
+ "grad_norm": 1.203284740447998,
558
+ "learning_rate": 2.6471075392082125e-05,
559
+ "loss": 0.1037,
560
+ "num_input_tokens_seen": 1723296,
561
+ "step": 690
562
+ },
563
+ {
564
+ "epoch": 0.6779661016949152,
565
+ "grad_norm": 1.635628581047058,
566
+ "learning_rate": 2.6372412054441116e-05,
567
+ "loss": 0.1216,
568
+ "num_input_tokens_seen": 1748384,
569
+ "step": 700
570
+ },
571
+ {
572
+ "epoch": 0.6876513317191283,
573
+ "grad_norm": 0.8993457555770874,
574
+ "learning_rate": 2.6272577744283965e-05,
575
+ "loss": 0.0853,
576
+ "num_input_tokens_seen": 1773600,
577
+ "step": 710
578
+ },
579
+ {
580
+ "epoch": 0.6973365617433414,
581
+ "grad_norm": 1.7306419610977173,
582
+ "learning_rate": 2.617158274115673e-05,
583
+ "loss": 0.1034,
584
+ "num_input_tokens_seen": 1798656,
585
+ "step": 720
586
+ },
587
+ {
588
+ "epoch": 0.7070217917675545,
589
+ "grad_norm": 2.770066976547241,
590
+ "learning_rate": 2.6069437444117432e-05,
591
+ "loss": 0.0872,
592
+ "num_input_tokens_seen": 1824544,
593
+ "step": 730
594
+ },
595
+ {
596
+ "epoch": 0.7167070217917676,
597
+ "grad_norm": 2.3590221405029297,
598
+ "learning_rate": 2.596615237066535e-05,
599
+ "loss": 0.1063,
600
+ "num_input_tokens_seen": 1848896,
601
+ "step": 740
602
+ },
603
+ {
604
+ "epoch": 0.7263922518159807,
605
+ "grad_norm": 1.0496519804000854,
606
+ "learning_rate": 2.586173815565805e-05,
607
+ "loss": 0.1104,
608
+ "num_input_tokens_seen": 1873248,
609
+ "step": 750
610
+ },
611
+ {
612
+ "epoch": 0.7360774818401937,
613
+ "grad_norm": 1.513573408126831,
614
+ "learning_rate": 2.575620555021634e-05,
615
+ "loss": 0.1125,
616
+ "num_input_tokens_seen": 1897184,
617
+ "step": 760
618
+ },
619
+ {
620
+ "epoch": 0.7457627118644068,
621
+ "grad_norm": 1.5545728206634521,
622
+ "learning_rate": 2.564956542061732e-05,
623
+ "loss": 0.0969,
624
+ "num_input_tokens_seen": 1922368,
625
+ "step": 770
626
+ },
627
+ {
628
+ "epoch": 0.7554479418886199,
629
+ "grad_norm": 1.9260263442993164,
630
+ "learning_rate": 2.5541828747175477e-05,
631
+ "loss": 0.1142,
632
+ "num_input_tokens_seen": 1947904,
633
+ "step": 780
634
+ },
635
+ {
636
+ "epoch": 0.7651331719128329,
637
+ "grad_norm": 2.396538734436035,
638
+ "learning_rate": 2.543300662311211e-05,
639
+ "loss": 0.0926,
640
+ "num_input_tokens_seen": 1971872,
641
+ "step": 790
642
+ },
643
+ {
644
+ "epoch": 0.774818401937046,
645
+ "grad_norm": 1.7069965600967407,
646
+ "learning_rate": 2.532311025341309e-05,
647
+ "loss": 0.0802,
648
+ "num_input_tokens_seen": 1996352,
649
+ "step": 800
650
+ },
651
+ {
652
+ "epoch": 0.784503631961259,
653
+ "grad_norm": 5.540910243988037,
654
+ "learning_rate": 2.5212150953675133e-05,
655
+ "loss": 0.1248,
656
+ "num_input_tokens_seen": 2020480,
657
+ "step": 810
658
+ },
659
+ {
660
+ "epoch": 0.7941888619854721,
661
+ "grad_norm": 1.7795952558517456,
662
+ "learning_rate": 2.5100140148940688e-05,
663
+ "loss": 0.0767,
664
+ "num_input_tokens_seen": 2044448,
665
+ "step": 820
666
+ },
667
+ {
668
+ "epoch": 0.8038740920096852,
669
+ "grad_norm": 2.7387983798980713,
670
+ "learning_rate": 2.498708937252153e-05,
671
+ "loss": 0.1239,
672
+ "num_input_tokens_seen": 2070400,
673
+ "step": 830
674
+ },
675
+ {
676
+ "epoch": 0.8135593220338984,
677
+ "grad_norm": 2.1243462562561035,
678
+ "learning_rate": 2.4873010264811222e-05,
679
+ "loss": 0.108,
680
+ "num_input_tokens_seen": 2095392,
681
+ "step": 840
682
+ },
683
+ {
684
+ "epoch": 0.8232445520581114,
685
+ "grad_norm": 0.9928631782531738,
686
+ "learning_rate": 2.4757914572086555e-05,
687
+ "loss": 0.0994,
688
+ "num_input_tokens_seen": 2120192,
689
+ "step": 850
690
+ },
691
+ {
692
+ "epoch": 0.8329297820823245,
693
+ "grad_norm": 6.047460556030273,
694
+ "learning_rate": 2.464181414529809e-05,
695
+ "loss": 0.0927,
696
+ "num_input_tokens_seen": 2144384,
697
+ "step": 860
698
+ },
699
+ {
700
+ "epoch": 0.8426150121065376,
701
+ "grad_norm": 2.2197115421295166,
702
+ "learning_rate": 2.4524720938849883e-05,
703
+ "loss": 0.1328,
704
+ "num_input_tokens_seen": 2168704,
705
+ "step": 870
706
+ },
707
+ {
708
+ "epoch": 0.8523002421307506,
709
+ "grad_norm": 2.0752601623535156,
710
+ "learning_rate": 2.440664700936861e-05,
711
+ "loss": 0.1229,
712
+ "num_input_tokens_seen": 2193248,
713
+ "step": 880
714
+ },
715
+ {
716
+ "epoch": 0.8619854721549637,
717
+ "grad_norm": 1.00425386428833,
718
+ "learning_rate": 2.4287604514462152e-05,
719
+ "loss": 0.0957,
720
+ "num_input_tokens_seen": 2217568,
721
+ "step": 890
722
+ },
723
+ {
724
+ "epoch": 0.8716707021791767,
725
+ "grad_norm": 1.9153094291687012,
726
+ "learning_rate": 2.416760571146774e-05,
727
+ "loss": 0.0975,
728
+ "num_input_tokens_seen": 2242048,
729
+ "step": 900
730
+ },
731
+ {
732
+ "epoch": 0.8813559322033898,
733
+ "grad_norm": 2.3558013439178467,
734
+ "learning_rate": 2.4046662956189898e-05,
735
+ "loss": 0.1068,
736
+ "num_input_tokens_seen": 2266112,
737
+ "step": 910
738
+ },
739
+ {
740
+ "epoch": 0.8910411622276029,
741
+ "grad_norm": 2.546351909637451,
742
+ "learning_rate": 2.3924788701628197e-05,
743
+ "loss": 0.0688,
744
+ "num_input_tokens_seen": 2290720,
745
+ "step": 920
746
+ },
747
+ {
748
+ "epoch": 0.9007263922518159,
749
+ "grad_norm": 1.2526168823242188,
750
+ "learning_rate": 2.3801995496695028e-05,
751
+ "loss": 0.1141,
752
+ "num_input_tokens_seen": 2315488,
753
+ "step": 930
754
+ },
755
+ {
756
+ "epoch": 0.910411622276029,
757
+ "grad_norm": 2.134089231491089,
758
+ "learning_rate": 2.367829598492348e-05,
759
+ "loss": 0.1328,
760
+ "num_input_tokens_seen": 2340992,
761
+ "step": 940
762
+ },
763
+ {
764
+ "epoch": 0.9200968523002422,
765
+ "grad_norm": 1.332915186882019,
766
+ "learning_rate": 2.3553702903165502e-05,
767
+ "loss": 0.1,
768
+ "num_input_tokens_seen": 2366880,
769
+ "step": 950
770
+ },
771
+ {
772
+ "epoch": 0.9297820823244553,
773
+ "grad_norm": 1.5140970945358276,
774
+ "learning_rate": 2.3428229080280407e-05,
775
+ "loss": 0.1089,
776
+ "num_input_tokens_seen": 2392000,
777
+ "step": 960
778
+ },
779
+ {
780
+ "epoch": 0.9394673123486683,
781
+ "grad_norm": 1.531954288482666,
782
+ "learning_rate": 2.330188743581398e-05,
783
+ "loss": 0.0924,
784
+ "num_input_tokens_seen": 2417472,
785
+ "step": 970
786
+ },
787
+ {
788
+ "epoch": 0.9491525423728814,
789
+ "grad_norm": 1.3347736597061157,
790
+ "learning_rate": 2.3174690978668155e-05,
791
+ "loss": 0.1205,
792
+ "num_input_tokens_seen": 2442496,
793
+ "step": 980
794
+ },
795
+ {
796
+ "epoch": 0.9588377723970944,
797
+ "grad_norm": 3.1497702598571777,
798
+ "learning_rate": 2.3046652805761588e-05,
799
+ "loss": 0.1004,
800
+ "num_input_tokens_seen": 2467392,
801
+ "step": 990
802
+ },
803
+ {
804
+ "epoch": 0.9685230024213075,
805
+ "grad_norm": 1.6756023168563843,
806
+ "learning_rate": 2.2917786100681078e-05,
807
+ "loss": 0.1007,
808
+ "num_input_tokens_seen": 2492768,
809
+ "step": 1000
810
+ },
811
+ {
812
+ "epoch": 0.9782082324455206,
813
+ "grad_norm": 2.56594181060791,
814
+ "learning_rate": 2.2788104132324125e-05,
815
+ "loss": 0.1179,
816
+ "num_input_tokens_seen": 2518176,
817
+ "step": 1010
818
+ },
819
+ {
820
+ "epoch": 0.9878934624697336,
821
+ "grad_norm": 2.1090595722198486,
822
+ "learning_rate": 2.2657620253532685e-05,
823
+ "loss": 0.0971,
824
+ "num_input_tokens_seen": 2543296,
825
+ "step": 1020
826
+ },
827
+ {
828
+ "epoch": 0.9975786924939467,
829
+ "grad_norm": 0.41959595680236816,
830
+ "learning_rate": 2.252634789971827e-05,
831
+ "loss": 0.0932,
832
+ "num_input_tokens_seen": 2567680,
833
+ "step": 1030
834
+ },
835
+ {
836
+ "epoch": 1.006779661016949,
837
+ "grad_norm": 1.6389803886413574,
838
+ "learning_rate": 2.2394300587478566e-05,
839
+ "loss": 0.0924,
840
+ "num_input_tokens_seen": 2591016,
841
+ "step": 1040
842
+ },
843
+ {
844
+ "epoch": 1.0164648910411622,
845
+ "grad_norm": 1.4045557975769043,
846
+ "learning_rate": 2.2261491913205684e-05,
847
+ "loss": 0.0985,
848
+ "num_input_tokens_seen": 2615752,
849
+ "step": 1050
850
+ },
851
+ {
852
+ "epoch": 1.0261501210653754,
853
+ "grad_norm": 2.0734925270080566,
854
+ "learning_rate": 2.212793555168617e-05,
855
+ "loss": 0.0853,
856
+ "num_input_tokens_seen": 2640200,
857
+ "step": 1060
858
+ },
859
+ {
860
+ "epoch": 1.0358353510895884,
861
+ "grad_norm": 2.1590147018432617,
862
+ "learning_rate": 2.1993645254692994e-05,
863
+ "loss": 0.116,
864
+ "num_input_tokens_seen": 2665416,
865
+ "step": 1070
866
+ },
867
+ {
868
+ "epoch": 1.0455205811138015,
869
+ "grad_norm": 1.739646553993225,
870
+ "learning_rate": 2.1858634849569578e-05,
871
+ "loss": 0.0972,
872
+ "num_input_tokens_seen": 2690376,
873
+ "step": 1080
874
+ },
875
+ {
876
+ "epoch": 1.0552058111380145,
877
+ "grad_norm": 0.6458954215049744,
878
+ "learning_rate": 2.1722918237806042e-05,
879
+ "loss": 0.0884,
880
+ "num_input_tokens_seen": 2715080,
881
+ "step": 1090
882
+ },
883
+ {
884
+ "epoch": 1.0648910411622277,
885
+ "grad_norm": 2.2830138206481934,
886
+ "learning_rate": 2.158650939360782e-05,
887
+ "loss": 0.073,
888
+ "num_input_tokens_seen": 2740424,
889
+ "step": 1100
890
+ },
891
+ {
892
+ "epoch": 1.0745762711864406,
893
+ "grad_norm": 1.5225194692611694,
894
+ "learning_rate": 2.1449422362456794e-05,
895
+ "loss": 0.0813,
896
+ "num_input_tokens_seen": 2765640,
897
+ "step": 1110
898
+ },
899
+ {
900
+ "epoch": 1.0842615012106538,
901
+ "grad_norm": 1.683604121208191,
902
+ "learning_rate": 2.13116712596651e-05,
903
+ "loss": 0.0953,
904
+ "num_input_tokens_seen": 2791176,
905
+ "step": 1120
906
+ },
907
+ {
908
+ "epoch": 1.0939467312348667,
909
+ "grad_norm": 1.5679166316986084,
910
+ "learning_rate": 2.1173270268921703e-05,
911
+ "loss": 0.0933,
912
+ "num_input_tokens_seen": 2816072,
913
+ "step": 1130
914
+ },
915
+ {
916
+ "epoch": 1.10363196125908,
917
+ "grad_norm": 1.3097947835922241,
918
+ "learning_rate": 2.1034233640831988e-05,
919
+ "loss": 0.0819,
920
+ "num_input_tokens_seen": 2840776,
921
+ "step": 1140
922
+ },
923
+ {
924
+ "epoch": 1.113317191283293,
925
+ "grad_norm": 0.5728388428688049,
926
+ "learning_rate": 2.0894575691450396e-05,
927
+ "loss": 0.0611,
928
+ "num_input_tokens_seen": 2865416,
929
+ "step": 1150
930
+ },
931
+ {
932
+ "epoch": 1.123002421307506,
933
+ "grad_norm": 2.3043558597564697,
934
+ "learning_rate": 2.0754310800806395e-05,
935
+ "loss": 0.0748,
936
+ "num_input_tokens_seen": 2890248,
937
+ "step": 1160
938
+ },
939
+ {
940
+ "epoch": 1.1326876513317192,
941
+ "grad_norm": 1.2087112665176392,
942
+ "learning_rate": 2.0613453411423797e-05,
943
+ "loss": 0.0959,
944
+ "num_input_tokens_seen": 2916392,
945
+ "step": 1170
946
+ },
947
+ {
948
+ "epoch": 1.1423728813559322,
949
+ "grad_norm": 1.5639240741729736,
950
+ "learning_rate": 2.0472018026833684e-05,
951
+ "loss": 0.0709,
952
+ "num_input_tokens_seen": 2941160,
953
+ "step": 1180
954
+ },
955
+ {
956
+ "epoch": 1.1520581113801454,
957
+ "grad_norm": 0.5889459848403931,
958
+ "learning_rate": 2.0330019210081022e-05,
959
+ "loss": 0.0731,
960
+ "num_input_tokens_seen": 2966120,
961
+ "step": 1190
962
+ },
963
+ {
964
+ "epoch": 1.1617433414043583,
965
+ "grad_norm": 1.854230523109436,
966
+ "learning_rate": 2.0187471582225173e-05,
967
+ "loss": 0.1005,
968
+ "num_input_tokens_seen": 2990088,
969
+ "step": 1200
970
+ },
971
+ {
972
+ "epoch": 1.1714285714285715,
973
+ "grad_norm": 2.01247239112854,
974
+ "learning_rate": 2.004438982083442e-05,
975
+ "loss": 0.0579,
976
+ "num_input_tokens_seen": 3015400,
977
+ "step": 1210
978
+ },
979
+ {
980
+ "epoch": 1.1811138014527844,
981
+ "grad_norm": 2.292900323867798,
982
+ "learning_rate": 1.9900788658474677e-05,
983
+ "loss": 0.0792,
984
+ "num_input_tokens_seen": 3039464,
985
+ "step": 1220
986
+ },
987
+ {
988
+ "epoch": 1.1907990314769976,
989
+ "grad_norm": 1.4194159507751465,
990
+ "learning_rate": 1.975668288119252e-05,
991
+ "loss": 0.057,
992
+ "num_input_tokens_seen": 3063816,
993
+ "step": 1230
994
+ },
995
+ {
996
+ "epoch": 1.2004842615012106,
997
+ "grad_norm": 1.0512489080429077,
998
+ "learning_rate": 1.961208732699275e-05,
999
+ "loss": 0.102,
1000
+ "num_input_tokens_seen": 3088968,
1001
+ "step": 1240
1002
+ },
1003
+ {
1004
+ "epoch": 1.2101694915254237,
1005
+ "grad_norm": 0.9465106129646301,
1006
+ "learning_rate": 1.9467016884310565e-05,
1007
+ "loss": 0.0691,
1008
+ "num_input_tokens_seen": 3113736,
1009
+ "step": 1250
1010
+ },
1011
+ {
1012
+ "epoch": 1.2198547215496367,
1013
+ "grad_norm": 1.274294376373291,
1014
+ "learning_rate": 1.9321486490478565e-05,
1015
+ "loss": 0.0668,
1016
+ "num_input_tokens_seen": 3138344,
1017
+ "step": 1260
1018
+ },
1019
+ {
1020
+ "epoch": 1.2295399515738499,
1021
+ "grad_norm": 1.9390579462051392,
1022
+ "learning_rate": 1.91755111301887e-05,
1023
+ "loss": 0.0711,
1024
+ "num_input_tokens_seen": 3163496,
1025
+ "step": 1270
1026
+ },
1027
+ {
1028
+ "epoch": 1.239225181598063,
1029
+ "grad_norm": 1.2855744361877441,
1030
+ "learning_rate": 1.902910583394938e-05,
1031
+ "loss": 0.0605,
1032
+ "num_input_tokens_seen": 3188392,
1033
+ "step": 1280
1034
+ },
1035
+ {
1036
+ "epoch": 1.248910411622276,
1037
+ "grad_norm": 2.931248188018799,
1038
+ "learning_rate": 1.888228567653781e-05,
1039
+ "loss": 0.0448,
1040
+ "num_input_tokens_seen": 3213224,
1041
+ "step": 1290
1042
+ },
1043
+ {
1044
+ "epoch": 1.2585956416464892,
1045
+ "grad_norm": 1.9991300106048584,
1046
+ "learning_rate": 1.873506577544784e-05,
1047
+ "loss": 0.0815,
1048
+ "num_input_tokens_seen": 3238568,
1049
+ "step": 1300
1050
+ },
1051
+ {
1052
+ "epoch": 1.2682808716707021,
1053
+ "grad_norm": 1.3530927896499634,
1054
+ "learning_rate": 1.8587461289333327e-05,
1055
+ "loss": 0.1043,
1056
+ "num_input_tokens_seen": 3264264,
1057
+ "step": 1310
1058
+ },
1059
+ {
1060
+ "epoch": 1.2779661016949153,
1061
+ "grad_norm": 2.07991099357605,
1062
+ "learning_rate": 1.8439487416447353e-05,
1063
+ "loss": 0.1037,
1064
+ "num_input_tokens_seen": 3288840,
1065
+ "step": 1320
1066
+ },
1067
+ {
1068
+ "epoch": 1.2876513317191283,
1069
+ "grad_norm": 1.8533947467803955,
1070
+ "learning_rate": 1.8291159393077294e-05,
1071
+ "loss": 0.0928,
1072
+ "num_input_tokens_seen": 3313832,
1073
+ "step": 1330
1074
+ },
1075
+ {
1076
+ "epoch": 1.2973365617433414,
1077
+ "grad_norm": 1.118119716644287,
1078
+ "learning_rate": 1.814249249197602e-05,
1079
+ "loss": 0.0775,
1080
+ "num_input_tokens_seen": 3337736,
1081
+ "step": 1340
1082
+ },
1083
+ {
1084
+ "epoch": 1.3070217917675544,
1085
+ "grad_norm": 2.740079641342163,
1086
+ "learning_rate": 1.7993502020789294e-05,
1087
+ "loss": 0.0521,
1088
+ "num_input_tokens_seen": 3362024,
1089
+ "step": 1350
1090
+ },
1091
+ {
1092
+ "epoch": 1.3167070217917676,
1093
+ "grad_norm": 1.9268351793289185,
1094
+ "learning_rate": 1.7844203320479614e-05,
1095
+ "loss": 0.0687,
1096
+ "num_input_tokens_seen": 3387496,
1097
+ "step": 1360
1098
+ },
1099
+ {
1100
+ "epoch": 1.3263922518159807,
1101
+ "grad_norm": 2.3576388359069824,
1102
+ "learning_rate": 1.7694611763746632e-05,
1103
+ "loss": 0.0704,
1104
+ "num_input_tokens_seen": 3412072,
1105
+ "step": 1370
1106
+ },
1107
+ {
1108
+ "epoch": 1.3360774818401937,
1109
+ "grad_norm": 1.127432942390442,
1110
+ "learning_rate": 1.754474275344427e-05,
1111
+ "loss": 0.0826,
1112
+ "num_input_tokens_seen": 3437096,
1113
+ "step": 1380
1114
+ },
1115
+ {
1116
+ "epoch": 1.3457627118644067,
1117
+ "grad_norm": 4.377537250518799,
1118
+ "learning_rate": 1.7394611720994747e-05,
1119
+ "loss": 0.0445,
1120
+ "num_input_tokens_seen": 3462120,
1121
+ "step": 1390
1122
+ },
1123
+ {
1124
+ "epoch": 1.3554479418886198,
1125
+ "grad_norm": 2.1285200119018555,
1126
+ "learning_rate": 1.724423412479967e-05,
1127
+ "loss": 0.0951,
1128
+ "num_input_tokens_seen": 3486952,
1129
+ "step": 1400
1130
+ },
1131
+ {
1132
+ "epoch": 1.365133171912833,
1133
+ "grad_norm": 0.16216270625591278,
1134
+ "learning_rate": 1.7093625448648348e-05,
1135
+ "loss": 0.0539,
1136
+ "num_input_tokens_seen": 3512264,
1137
+ "step": 1410
1138
+ },
1139
+ {
1140
+ "epoch": 1.374818401937046,
1141
+ "grad_norm": 2.1299915313720703,
1142
+ "learning_rate": 1.694280120012349e-05,
1143
+ "loss": 0.0848,
1144
+ "num_input_tokens_seen": 3537192,
1145
+ "step": 1420
1146
+ },
1147
+ {
1148
+ "epoch": 1.3845036319612591,
1149
+ "grad_norm": 2.476757049560547,
1150
+ "learning_rate": 1.6791776909004434e-05,
1151
+ "loss": 0.0629,
1152
+ "num_input_tokens_seen": 3560872,
1153
+ "step": 1430
1154
+ },
1155
+ {
1156
+ "epoch": 1.394188861985472,
1157
+ "grad_norm": 0.4373377561569214,
1158
+ "learning_rate": 1.664056812566812e-05,
1159
+ "loss": 0.079,
1160
+ "num_input_tokens_seen": 3586216,
1161
+ "step": 1440
1162
+ },
1163
+ {
1164
+ "epoch": 1.4038740920096853,
1165
+ "grad_norm": 1.9471170902252197,
1166
+ "learning_rate": 1.648919041948792e-05,
1167
+ "loss": 0.0798,
1168
+ "num_input_tokens_seen": 3610792,
1169
+ "step": 1450
1170
+ },
1171
+ {
1172
+ "epoch": 1.4135593220338982,
1173
+ "grad_norm": 2.911750316619873,
1174
+ "learning_rate": 1.6337659377230544e-05,
1175
+ "loss": 0.0897,
1176
+ "num_input_tokens_seen": 3634760,
1177
+ "step": 1460
1178
+ },
1179
+ {
1180
+ "epoch": 1.4232445520581114,
1181
+ "grad_norm": 2.9474802017211914,
1182
+ "learning_rate": 1.61859906014511e-05,
1183
+ "loss": 0.0858,
1184
+ "num_input_tokens_seen": 3659560,
1185
+ "step": 1470
1186
+ },
1187
+ {
1188
+ "epoch": 1.4329297820823244,
1189
+ "grad_norm": 0.6501768827438354,
1190
+ "learning_rate": 1.6034199708886573e-05,
1191
+ "loss": 0.0532,
1192
+ "num_input_tokens_seen": 3684840,
1193
+ "step": 1480
1194
+ },
1195
+ {
1196
+ "epoch": 1.4426150121065375,
1197
+ "grad_norm": 1.6708017587661743,
1198
+ "learning_rate": 1.5882302328847847e-05,
1199
+ "loss": 0.0842,
1200
+ "num_input_tokens_seen": 3709096,
1201
+ "step": 1490
1202
+ },
1203
+ {
1204
+ "epoch": 1.4523002421307507,
1205
+ "grad_norm": 1.5014967918395996,
1206
+ "learning_rate": 1.5730314101610376e-05,
1207
+ "loss": 0.0367,
1208
+ "num_input_tokens_seen": 3734728,
1209
+ "step": 1500
1210
+ },
1211
+ {
1212
+ "epoch": 1.4619854721549637,
1213
+ "grad_norm": 3.2587804794311523,
1214
+ "learning_rate": 1.5578250676803824e-05,
1215
+ "loss": 0.1085,
1216
+ "num_input_tokens_seen": 3758984,
1217
+ "step": 1510
1218
+ },
1219
+ {
1220
+ "epoch": 1.4716707021791768,
1221
+ "grad_norm": 6.304242134094238,
1222
+ "learning_rate": 1.5426127711800636e-05,
1223
+ "loss": 0.0712,
1224
+ "num_input_tokens_seen": 3784296,
1225
+ "step": 1520
1226
+ },
1227
+ {
1228
+ "epoch": 1.4813559322033898,
1229
+ "grad_norm": 1.1681016683578491,
1230
+ "learning_rate": 1.5273960870103872e-05,
1231
+ "loss": 0.0705,
1232
+ "num_input_tokens_seen": 3809768,
1233
+ "step": 1530
1234
+ },
1235
+ {
1236
+ "epoch": 1.491041162227603,
1237
+ "grad_norm": 1.111617922782898,
1238
+ "learning_rate": 1.5121765819734418e-05,
1239
+ "loss": 0.071,
1240
+ "num_input_tokens_seen": 3834536,
1241
+ "step": 1540
1242
+ },
1243
+ {
1244
+ "epoch": 1.5007263922518161,
1245
+ "grad_norm": 1.7780523300170898,
1246
+ "learning_rate": 1.4969558231617681e-05,
1247
+ "loss": 0.0648,
1248
+ "num_input_tokens_seen": 3858792,
1249
+ "step": 1550
1250
+ },
1251
+ {
1252
+ "epoch": 1.510411622276029,
1253
+ "grad_norm": 2.2017934322357178,
1254
+ "learning_rate": 1.4817353777970038e-05,
1255
+ "loss": 0.0633,
1256
+ "num_input_tokens_seen": 3883976,
1257
+ "step": 1560
1258
+ },
1259
+ {
1260
+ "epoch": 1.520096852300242,
1261
+ "grad_norm": 1.8567978143692017,
1262
+ "learning_rate": 1.466516813068512e-05,
1263
+ "loss": 0.0726,
1264
+ "num_input_tokens_seen": 3908392,
1265
+ "step": 1570
1266
+ },
1267
+ {
1268
+ "epoch": 1.5297820823244552,
1269
+ "grad_norm": 2.567291021347046,
1270
+ "learning_rate": 1.451301695972015e-05,
1271
+ "loss": 0.0882,
1272
+ "num_input_tokens_seen": 3932552,
1273
+ "step": 1580
1274
+ },
1275
+ {
1276
+ "epoch": 1.5394673123486684,
1277
+ "grad_norm": 1.9968935251235962,
1278
+ "learning_rate": 1.436091593148244e-05,
1279
+ "loss": 0.1149,
1280
+ "num_input_tokens_seen": 3957672,
1281
+ "step": 1590
1282
+ },
1283
+ {
1284
+ "epoch": 1.5491525423728814,
1285
+ "grad_norm": 1.9058917760849,
1286
+ "learning_rate": 1.4208880707216323e-05,
1287
+ "loss": 0.0841,
1288
+ "num_input_tokens_seen": 3982824,
1289
+ "step": 1600
1290
+ },
1291
+ {
1292
+ "epoch": 1.5588377723970943,
1293
+ "grad_norm": 1.9218000173568726,
1294
+ "learning_rate": 1.405692694139054e-05,
1295
+ "loss": 0.0896,
1296
+ "num_input_tokens_seen": 4008072,
1297
+ "step": 1610
1298
+ },
1299
+ {
1300
+ "epoch": 1.5685230024213075,
1301
+ "grad_norm": 1.5786553621292114,
1302
+ "learning_rate": 1.3905070280086387e-05,
1303
+ "loss": 0.0629,
1304
+ "num_input_tokens_seen": 4033096,
1305
+ "step": 1620
1306
+ },
1307
+ {
1308
+ "epoch": 1.5782082324455207,
1309
+ "grad_norm": 2.503990888595581,
1310
+ "learning_rate": 1.3753326359386695e-05,
1311
+ "loss": 0.077,
1312
+ "num_input_tokens_seen": 4058120,
1313
+ "step": 1630
1314
+ },
1315
+ {
1316
+ "epoch": 1.5878934624697336,
1317
+ "grad_norm": 1.5616143941879272,
1318
+ "learning_rate": 1.3601710803765814e-05,
1319
+ "loss": 0.0853,
1320
+ "num_input_tokens_seen": 4082792,
1321
+ "step": 1640
1322
+ },
1323
+ {
1324
+ "epoch": 1.5975786924939466,
1325
+ "grad_norm": 1.2533211708068848,
1326
+ "learning_rate": 1.3450239224480884e-05,
1327
+ "loss": 0.0605,
1328
+ "num_input_tokens_seen": 4107336,
1329
+ "step": 1650
1330
+ },
1331
+ {
1332
+ "epoch": 1.6072639225181597,
1333
+ "grad_norm": 1.1046490669250488,
1334
+ "learning_rate": 1.329892721796433e-05,
1335
+ "loss": 0.0985,
1336
+ "num_input_tokens_seen": 4132456,
1337
+ "step": 1660
1338
+ },
1339
+ {
1340
+ "epoch": 1.616949152542373,
1341
+ "grad_norm": 1.143494725227356,
1342
+ "learning_rate": 1.314779036421802e-05,
1343
+ "loss": 0.0547,
1344
+ "num_input_tokens_seen": 4156584,
1345
+ "step": 1670
1346
+ },
1347
+ {
1348
+ "epoch": 1.626634382566586,
1349
+ "grad_norm": 2.6082706451416016,
1350
+ "learning_rate": 1.2996844225209033e-05,
1351
+ "loss": 0.0919,
1352
+ "num_input_tokens_seen": 4181448,
1353
+ "step": 1680
1354
+ },
1355
+ {
1356
+ "epoch": 1.636319612590799,
1357
+ "grad_norm": 2.4191458225250244,
1358
+ "learning_rate": 1.2846104343267283e-05,
1359
+ "loss": 0.1204,
1360
+ "num_input_tokens_seen": 4207560,
1361
+ "step": 1690
1362
+ },
1363
+ {
1364
+ "epoch": 1.646004842615012,
1365
+ "grad_norm": 2.051799774169922,
1366
+ "learning_rate": 1.2695586239485223e-05,
1367
+ "loss": 0.0664,
1368
+ "num_input_tokens_seen": 4232040,
1369
+ "step": 1700
1370
+ },
1371
+ {
1372
+ "epoch": 1.6556900726392252,
1373
+ "grad_norm": 1.525844931602478,
1374
+ "learning_rate": 1.254530541211968e-05,
1375
+ "loss": 0.0805,
1376
+ "num_input_tokens_seen": 4257576,
1377
+ "step": 1710
1378
+ },
1379
+ {
1380
+ "epoch": 1.6653753026634384,
1381
+ "grad_norm": 0.9474373459815979,
1382
+ "learning_rate": 1.2395277334996045e-05,
1383
+ "loss": 0.073,
1384
+ "num_input_tokens_seen": 4282472,
1385
+ "step": 1720
1386
+ },
1387
+ {
1388
+ "epoch": 1.6750605326876513,
1389
+ "grad_norm": 1.8932424783706665,
1390
+ "learning_rate": 1.2245517455915036e-05,
1391
+ "loss": 0.0734,
1392
+ "num_input_tokens_seen": 4306792,
1393
+ "step": 1730
1394
+ },
1395
+ {
1396
+ "epoch": 1.6847457627118643,
1397
+ "grad_norm": 1.9888746738433838,
1398
+ "learning_rate": 1.2096041195062051e-05,
1399
+ "loss": 0.0831,
1400
+ "num_input_tokens_seen": 4333384,
1401
+ "step": 1740
1402
+ },
1403
+ {
1404
+ "epoch": 1.6944309927360774,
1405
+ "grad_norm": 1.8355742692947388,
1406
+ "learning_rate": 1.1946863943419452e-05,
1407
+ "loss": 0.0691,
1408
+ "num_input_tokens_seen": 4358344,
1409
+ "step": 1750
1410
+ },
1411
+ {
1412
+ "epoch": 1.7041162227602906,
1413
+ "grad_norm": 2.8447251319885254,
1414
+ "learning_rate": 1.1798001061181799e-05,
1415
+ "loss": 0.0988,
1416
+ "num_input_tokens_seen": 4381768,
1417
+ "step": 1760
1418
+ },
1419
+ {
1420
+ "epoch": 1.7138014527845038,
1421
+ "grad_norm": 2.670257806777954,
1422
+ "learning_rate": 1.1649467876174252e-05,
1423
+ "loss": 0.0936,
1424
+ "num_input_tokens_seen": 4405192,
1425
+ "step": 1770
1426
+ },
1427
+ {
1428
+ "epoch": 1.7234866828087168,
1429
+ "grad_norm": 1.188839077949524,
1430
+ "learning_rate": 1.1501279682274368e-05,
1431
+ "loss": 0.0901,
1432
+ "num_input_tokens_seen": 4430344,
1433
+ "step": 1780
1434
+ },
1435
+ {
1436
+ "epoch": 1.7331719128329297,
1437
+ "grad_norm": 2.494746685028076,
1438
+ "learning_rate": 1.1353451737837312e-05,
1439
+ "loss": 0.0691,
1440
+ "num_input_tokens_seen": 4455336,
1441
+ "step": 1790
1442
+ },
1443
+ {
1444
+ "epoch": 1.7428571428571429,
1445
+ "grad_norm": 1.3223942518234253,
1446
+ "learning_rate": 1.1205999264124788e-05,
1447
+ "loss": 0.0668,
1448
+ "num_input_tokens_seen": 4480648,
1449
+ "step": 1800
1450
+ },
1451
+ {
1452
+ "epoch": 1.752542372881356,
1453
+ "grad_norm": 1.3812003135681152,
1454
+ "learning_rate": 1.105893744373776e-05,
1455
+ "loss": 0.0788,
1456
+ "num_input_tokens_seen": 4506600,
1457
+ "step": 1810
1458
+ },
1459
+ {
1460
+ "epoch": 1.762227602905569,
1461
+ "grad_norm": 0.7805346250534058,
1462
+ "learning_rate": 1.0912281419053139e-05,
1463
+ "loss": 0.0723,
1464
+ "num_input_tokens_seen": 4531368,
1465
+ "step": 1820
1466
+ },
1467
+ {
1468
+ "epoch": 1.771912832929782,
1469
+ "grad_norm": 1.105878472328186,
1470
+ "learning_rate": 1.0766046290664662e-05,
1471
+ "loss": 0.0779,
1472
+ "num_input_tokens_seen": 4555272,
1473
+ "step": 1830
1474
+ },
1475
+ {
1476
+ "epoch": 1.7815980629539951,
1477
+ "grad_norm": 1.8672295808792114,
1478
+ "learning_rate": 1.0620247115828044e-05,
1479
+ "loss": 0.0838,
1480
+ "num_input_tokens_seen": 4580328,
1481
+ "step": 1840
1482
+ },
1483
+ {
1484
+ "epoch": 1.7912832929782083,
1485
+ "grad_norm": 1.844306468963623,
1486
+ "learning_rate": 1.047489890691055e-05,
1487
+ "loss": 0.0594,
1488
+ "num_input_tokens_seen": 4605768,
1489
+ "step": 1850
1490
+ },
1491
+ {
1492
+ "epoch": 1.8009685230024213,
1493
+ "grad_norm": 1.2717005014419556,
1494
+ "learning_rate": 1.0330016629845276e-05,
1495
+ "loss": 0.04,
1496
+ "num_input_tokens_seen": 4631048,
1497
+ "step": 1860
1498
+ },
1499
+ {
1500
+ "epoch": 1.8106537530266342,
1501
+ "grad_norm": 3.5843582153320312,
1502
+ "learning_rate": 1.0185615202590144e-05,
1503
+ "loss": 0.084,
1504
+ "num_input_tokens_seen": 4656456,
1505
+ "step": 1870
1506
+ },
1507
+ {
1508
+ "epoch": 1.8203389830508474,
1509
+ "grad_norm": 4.254288673400879,
1510
+ "learning_rate": 1.004170949359187e-05,
1511
+ "loss": 0.0654,
1512
+ "num_input_tokens_seen": 4681384,
1513
+ "step": 1880
1514
+ },
1515
+ {
1516
+ "epoch": 1.8300242130750606,
1517
+ "grad_norm": 1.351646065711975,
1518
+ "learning_rate": 9.89831432025501e-06,
1519
+ "loss": 0.0712,
1520
+ "num_input_tokens_seen": 4706216,
1521
+ "step": 1890
1522
+ },
1523
+ {
1524
+ "epoch": 1.8397094430992738,
1525
+ "grad_norm": 1.9015384912490845,
1526
+ "learning_rate": 9.755444447416255e-06,
1527
+ "loss": 0.0829,
1528
+ "num_input_tokens_seen": 4730984,
1529
+ "step": 1900
1530
+ },
1531
+ {
1532
+ "epoch": 1.8493946731234867,
1533
+ "grad_norm": 1.3803085088729858,
1534
+ "learning_rate": 9.613114585824196e-06,
1535
+ "loss": 0.0532,
1536
+ "num_input_tokens_seen": 4755112,
1537
+ "step": 1910
1538
+ },
1539
+ {
1540
+ "epoch": 1.8590799031476997,
1541
+ "grad_norm": 6.487275123596191,
1542
+ "learning_rate": 9.471339390624574e-06,
1543
+ "loss": 0.0781,
1544
+ "num_input_tokens_seen": 4780232,
1545
+ "step": 1920
1546
+ },
1547
+ {
1548
+ "epoch": 1.8687651331719128,
1549
+ "grad_norm": 2.182865619659424,
1550
+ "learning_rate": 9.330133459851323e-06,
1551
+ "loss": 0.0908,
1552
+ "num_input_tokens_seen": 4805192,
1553
+ "step": 1930
1554
+ },
1555
+ {
1556
+ "epoch": 1.878450363196126,
1557
+ "grad_norm": 0.42010384798049927,
1558
+ "learning_rate": 9.189511332923463e-06,
1559
+ "loss": 0.0398,
1560
+ "num_input_tokens_seen": 4830856,
1561
+ "step": 1940
1562
+ },
1563
+ {
1564
+ "epoch": 1.888135593220339,
1565
+ "grad_norm": 1.609157919883728,
1566
+ "learning_rate": 9.049487489148008e-06,
1567
+ "loss": 0.0912,
1568
+ "num_input_tokens_seen": 4855656,
1569
+ "step": 1950
1570
+ },
1571
+ {
1572
+ "epoch": 1.897820823244552,
1573
+ "grad_norm": 2.4291250705718994,
1574
+ "learning_rate": 8.910076346229134e-06,
1575
+ "loss": 0.0746,
1576
+ "num_input_tokens_seen": 4880392,
1577
+ "step": 1960
1578
+ },
1579
+ {
1580
+ "epoch": 1.907506053268765,
1581
+ "grad_norm": 2.243717670440674,
1582
+ "learning_rate": 8.77129225878361e-06,
1583
+ "loss": 0.1066,
1584
+ "num_input_tokens_seen": 4905320,
1585
+ "step": 1970
1586
+ },
1587
+ {
1588
+ "epoch": 1.9171912832929783,
1589
+ "grad_norm": 2.145559072494507,
1590
+ "learning_rate": 8.633149516862777e-06,
1591
+ "loss": 0.0839,
1592
+ "num_input_tokens_seen": 4930536,
1593
+ "step": 1980
1594
+ },
1595
+ {
1596
+ "epoch": 1.9268765133171912,
1597
+ "grad_norm": 0.6746326088905334,
1598
+ "learning_rate": 8.495662344481135e-06,
1599
+ "loss": 0.0527,
1600
+ "num_input_tokens_seen": 4956168,
1601
+ "step": 1990
1602
+ },
1603
+ {
1604
+ "epoch": 1.9365617433414044,
1605
+ "grad_norm": 1.293521761894226,
1606
+ "learning_rate": 8.358844898151791e-06,
1607
+ "loss": 0.1033,
1608
+ "num_input_tokens_seen": 4980584,
1609
+ "step": 2000
1610
+ },
1611
+ {
1612
+ "epoch": 1.9462469733656174,
1613
+ "grad_norm": 1.7922570705413818,
1614
+ "learning_rate": 8.222711265428779e-06,
1615
+ "loss": 0.079,
1616
+ "num_input_tokens_seen": 5005992,
1617
+ "step": 2010
1618
+ },
1619
+ {
1620
+ "epoch": 1.9559322033898305,
1621
+ "grad_norm": 1.0770626068115234,
1622
+ "learning_rate": 8.087275463456548e-06,
1623
+ "loss": 0.0652,
1624
+ "num_input_tokens_seen": 5032168,
1625
+ "step": 2020
1626
+ },
1627
+ {
1628
+ "epoch": 1.9656174334140437,
1629
+ "grad_norm": 0.7968271374702454,
1630
+ "learning_rate": 7.952551437526648e-06,
1631
+ "loss": 0.0593,
1632
+ "num_input_tokens_seen": 5056296,
1633
+ "step": 2030
1634
+ },
1635
+ {
1636
+ "epoch": 1.9753026634382567,
1637
+ "grad_norm": 2.140667676925659,
1638
+ "learning_rate": 7.818553059641868e-06,
1639
+ "loss": 0.0933,
1640
+ "num_input_tokens_seen": 5080424,
1641
+ "step": 2040
1642
+ },
1643
+ {
1644
+ "epoch": 1.9849878934624696,
1645
+ "grad_norm": 2.905066967010498,
1646
+ "learning_rate": 7.685294127087852e-06,
1647
+ "loss": 0.059,
1648
+ "num_input_tokens_seen": 5104904,
1649
+ "step": 2050
1650
+ },
1651
+ {
1652
+ "epoch": 1.9946731234866828,
1653
+ "grad_norm": 2.5095653533935547,
1654
+ "learning_rate": 7.552788361012486e-06,
1655
+ "loss": 0.0766,
1656
+ "num_input_tokens_seen": 5129064,
1657
+ "step": 2060
1658
+ },
1659
+ {
1660
+ "epoch": 2.0038740920096854,
1661
+ "grad_norm": 1.0241445302963257,
1662
+ "learning_rate": 7.421049405013061e-06,
1663
+ "loss": 0.0637,
1664
+ "num_input_tokens_seen": 5152120,
1665
+ "step": 2070
1666
+ },
1667
+ {
1668
+ "epoch": 2.013559322033898,
1669
+ "grad_norm": 1.7620762586593628,
1670
+ "learning_rate": 7.290090823731452e-06,
1671
+ "loss": 0.0419,
1672
+ "num_input_tokens_seen": 5176728,
1673
+ "step": 2080
1674
+ },
1675
+ {
1676
+ "epoch": 2.0232445520581113,
1677
+ "grad_norm": 1.1471503973007202,
1678
+ "learning_rate": 7.159926101457423e-06,
1679
+ "loss": 0.0586,
1680
+ "num_input_tokens_seen": 5201176,
1681
+ "step": 2090
1682
+ },
1683
+ {
1684
+ "epoch": 2.0329297820823244,
1685
+ "grad_norm": 1.4868978261947632,
1686
+ "learning_rate": 7.030568640740202e-06,
1687
+ "loss": 0.0382,
1688
+ "num_input_tokens_seen": 5225368,
1689
+ "step": 2100
1690
+ },
1691
+ {
1692
+ "epoch": 2.0426150121065376,
1693
+ "grad_norm": 0.8362380266189575,
1694
+ "learning_rate": 6.902031761008456e-06,
1695
+ "loss": 0.0597,
1696
+ "num_input_tokens_seen": 5250136,
1697
+ "step": 2110
1698
+ },
1699
+ {
1700
+ "epoch": 2.052300242130751,
1701
+ "grad_norm": 2.6067404747009277,
1702
+ "learning_rate": 6.774328697198879e-06,
1703
+ "loss": 0.0367,
1704
+ "num_input_tokens_seen": 5274264,
1705
+ "step": 2120
1706
+ },
1707
+ {
1708
+ "epoch": 2.0619854721549635,
1709
+ "grad_norm": 1.6327483654022217,
1710
+ "learning_rate": 6.647472598393399e-06,
1711
+ "loss": 0.04,
1712
+ "num_input_tokens_seen": 5298264,
1713
+ "step": 2130
1714
+ },
1715
+ {
1716
+ "epoch": 2.0716707021791767,
1717
+ "grad_norm": 1.461899995803833,
1718
+ "learning_rate": 6.521476526465309e-06,
1719
+ "loss": 0.0426,
1720
+ "num_input_tokens_seen": 5322872,
1721
+ "step": 2140
1722
+ },
1723
+ {
1724
+ "epoch": 2.08135593220339,
1725
+ "grad_norm": 2.3133087158203125,
1726
+ "learning_rate": 6.3963534547343126e-06,
1727
+ "loss": 0.0706,
1728
+ "num_input_tokens_seen": 5348120,
1729
+ "step": 2150
1730
+ },
1731
+ {
1732
+ "epoch": 2.091041162227603,
1733
+ "grad_norm": 3.1375937461853027,
1734
+ "learning_rate": 6.27211626663071e-06,
1735
+ "loss": 0.0377,
1736
+ "num_input_tokens_seen": 5373240,
1737
+ "step": 2160
1738
+ },
1739
+ {
1740
+ "epoch": 2.100726392251816,
1741
+ "grad_norm": 2.147362470626831,
1742
+ "learning_rate": 6.148777754368862e-06,
1743
+ "loss": 0.0608,
1744
+ "num_input_tokens_seen": 5398296,
1745
+ "step": 2170
1746
+ },
1747
+ {
1748
+ "epoch": 2.110411622276029,
1749
+ "grad_norm": 0.6415455341339111,
1750
+ "learning_rate": 6.026350617630011e-06,
1751
+ "loss": 0.0334,
1752
+ "num_input_tokens_seen": 5424408,
1753
+ "step": 2180
1754
+ },
1755
+ {
1756
+ "epoch": 2.120096852300242,
1757
+ "grad_norm": 3.5363268852233887,
1758
+ "learning_rate": 5.904847462254646e-06,
1759
+ "loss": 0.0445,
1760
+ "num_input_tokens_seen": 5449880,
1761
+ "step": 2190
1762
+ },
1763
+ {
1764
+ "epoch": 2.1297820823244553,
1765
+ "grad_norm": 2.8637278079986572,
1766
+ "learning_rate": 5.784280798944537e-06,
1767
+ "loss": 0.0735,
1768
+ "num_input_tokens_seen": 5474808,
1769
+ "step": 2200
1770
+ },
1771
+ {
1772
+ "epoch": 2.1394673123486685,
1773
+ "grad_norm": 1.1030181646347046,
1774
+ "learning_rate": 5.6646630419745404e-06,
1775
+ "loss": 0.056,
1776
+ "num_input_tokens_seen": 5499672,
1777
+ "step": 2210
1778
+ },
1779
+ {
1780
+ "epoch": 2.1491525423728812,
1781
+ "grad_norm": 1.6034140586853027,
1782
+ "learning_rate": 5.5460065079143694e-06,
1783
+ "loss": 0.0703,
1784
+ "num_input_tokens_seen": 5523672,
1785
+ "step": 2220
1786
+ },
1787
+ {
1788
+ "epoch": 2.1588377723970944,
1789
+ "grad_norm": 4.010861396789551,
1790
+ "learning_rate": 5.428323414360401e-06,
1791
+ "loss": 0.0504,
1792
+ "num_input_tokens_seen": 5548664,
1793
+ "step": 2230
1794
+ },
1795
+ {
1796
+ "epoch": 2.1685230024213076,
1797
+ "grad_norm": 2.1378917694091797,
1798
+ "learning_rate": 5.311625878677658e-06,
1799
+ "loss": 0.0398,
1800
+ "num_input_tokens_seen": 5573944,
1801
+ "step": 2240
1802
+ },
1803
+ {
1804
+ "epoch": 2.1782082324455208,
1805
+ "grad_norm": 1.6304939985275269,
1806
+ "learning_rate": 5.195925916752166e-06,
1807
+ "loss": 0.045,
1808
+ "num_input_tokens_seen": 5599224,
1809
+ "step": 2250
1810
+ },
1811
+ {
1812
+ "epoch": 2.1878934624697335,
1813
+ "grad_norm": 1.6586905717849731,
1814
+ "learning_rate": 5.081235441753685e-06,
1815
+ "loss": 0.0483,
1816
+ "num_input_tokens_seen": 5623864,
1817
+ "step": 2260
1818
+ },
1819
+ {
1820
+ "epoch": 2.1975786924939467,
1821
+ "grad_norm": 2.3342106342315674,
1822
+ "learning_rate": 4.9675662629091055e-06,
1823
+ "loss": 0.0476,
1824
+ "num_input_tokens_seen": 5648760,
1825
+ "step": 2270
1826
+ },
1827
+ {
1828
+ "epoch": 2.20726392251816,
1829
+ "grad_norm": 1.122441291809082,
1830
+ "learning_rate": 4.854930084286458e-06,
1831
+ "loss": 0.0537,
1832
+ "num_input_tokens_seen": 5673720,
1833
+ "step": 2280
1834
+ },
1835
+ {
1836
+ "epoch": 2.216949152542373,
1837
+ "grad_norm": 0.22967131435871124,
1838
+ "learning_rate": 4.743338503589796e-06,
1839
+ "loss": 0.0567,
1840
+ "num_input_tokens_seen": 5697784,
1841
+ "step": 2290
1842
+ },
1843
+ {
1844
+ "epoch": 2.226634382566586,
1845
+ "grad_norm": 3.79902720451355,
1846
+ "learning_rate": 4.632803010965056e-06,
1847
+ "loss": 0.0502,
1848
+ "num_input_tokens_seen": 5722040,
1849
+ "step": 2300
1850
+ },
1851
+ {
1852
+ "epoch": 2.236319612590799,
1853
+ "grad_norm": 0.5887905359268188,
1854
+ "learning_rate": 4.523334987816917e-06,
1855
+ "loss": 0.0444,
1856
+ "num_input_tokens_seen": 5747672,
1857
+ "step": 2310
1858
+ },
1859
+ {
1860
+ "epoch": 2.246004842615012,
1861
+ "grad_norm": 1.776781678199768,
1862
+ "learning_rate": 4.414945705636949e-06,
1863
+ "loss": 0.0482,
1864
+ "num_input_tokens_seen": 5772056,
1865
+ "step": 2320
1866
+ },
1867
+ {
1868
+ "epoch": 2.2556900726392253,
1869
+ "grad_norm": 2.457751512527466,
1870
+ "learning_rate": 4.307646324843004e-06,
1871
+ "loss": 0.0398,
1872
+ "num_input_tokens_seen": 5796728,
1873
+ "step": 2330
1874
+ },
1875
+ {
1876
+ "epoch": 2.2653753026634385,
1877
+ "grad_norm": 1.8455132246017456,
1878
+ "learning_rate": 4.201447893630065e-06,
1879
+ "loss": 0.0268,
1880
+ "num_input_tokens_seen": 5822520,
1881
+ "step": 2340
1882
+ },
1883
+ {
1884
+ "epoch": 2.275060532687651,
1885
+ "grad_norm": 3.7571520805358887,
1886
+ "learning_rate": 4.096361346832681e-06,
1887
+ "loss": 0.0427,
1888
+ "num_input_tokens_seen": 5847768,
1889
+ "step": 2350
1890
+ },
1891
+ {
1892
+ "epoch": 2.2847457627118644,
1893
+ "grad_norm": 4.052141189575195,
1894
+ "learning_rate": 3.992397504799039e-06,
1895
+ "loss": 0.0363,
1896
+ "num_input_tokens_seen": 5873208,
1897
+ "step": 2360
1898
+ },
1899
+ {
1900
+ "epoch": 2.2944309927360775,
1901
+ "grad_norm": 2.814667224884033,
1902
+ "learning_rate": 3.889567072276827e-06,
1903
+ "loss": 0.0432,
1904
+ "num_input_tokens_seen": 5897368,
1905
+ "step": 2370
1906
+ },
1907
+ {
1908
+ "epoch": 2.3041162227602907,
1909
+ "grad_norm": 0.680135190486908,
1910
+ "learning_rate": 3.78788063731103e-06,
1911
+ "loss": 0.0662,
1912
+ "num_input_tokens_seen": 5921656,
1913
+ "step": 2380
1914
+ },
1915
+ {
1916
+ "epoch": 2.3138014527845034,
1917
+ "grad_norm": 4.201208591461182,
1918
+ "learning_rate": 3.6873486701536814e-06,
1919
+ "loss": 0.0434,
1920
+ "num_input_tokens_seen": 5946328,
1921
+ "step": 2390
1922
+ },
1923
+ {
1924
+ "epoch": 2.3234866828087166,
1925
+ "grad_norm": 1.828552007675171,
1926
+ "learning_rate": 3.587981522185829e-06,
1927
+ "loss": 0.0425,
1928
+ "num_input_tokens_seen": 5971352,
1929
+ "step": 2400
1930
+ },
1931
+ {
1932
+ "epoch": 2.33317191283293,
1933
+ "grad_norm": 0.6704538464546204,
1934
+ "learning_rate": 3.4897894248516736e-06,
1935
+ "loss": 0.0533,
1936
+ "num_input_tokens_seen": 5995544,
1937
+ "step": 2410
1938
+ },
1939
+ {
1940
+ "epoch": 2.342857142857143,
1941
+ "grad_norm": 2.377774238586426,
1942
+ "learning_rate": 3.3927824886050555e-06,
1943
+ "loss": 0.0499,
1944
+ "num_input_tokens_seen": 6020600,
1945
+ "step": 2420
1946
+ },
1947
+ {
1948
+ "epoch": 2.3525423728813557,
1949
+ "grad_norm": 0.2766050398349762,
1950
+ "learning_rate": 3.2969707018684657e-06,
1951
+ "loss": 0.021,
1952
+ "num_input_tokens_seen": 6045304,
1953
+ "step": 2430
1954
+ },
1955
+ {
1956
+ "epoch": 2.362227602905569,
1957
+ "grad_norm": 1.9754971265792847,
1958
+ "learning_rate": 3.202363930004536e-06,
1959
+ "loss": 0.0216,
1960
+ "num_input_tokens_seen": 6070776,
1961
+ "step": 2440
1962
+ },
1963
+ {
1964
+ "epoch": 2.371912832929782,
1965
+ "grad_norm": 6.165454387664795,
1966
+ "learning_rate": 3.1089719143002615e-06,
1967
+ "loss": 0.0431,
1968
+ "num_input_tokens_seen": 6095256,
1969
+ "step": 2450
1970
+ },
1971
+ {
1972
+ "epoch": 2.3815980629539952,
1973
+ "grad_norm": 2.579355001449585,
1974
+ "learning_rate": 3.016804270963994e-06,
1975
+ "loss": 0.0515,
1976
+ "num_input_tokens_seen": 6120088,
1977
+ "step": 2460
1978
+ },
1979
+ {
1980
+ "epoch": 2.3912832929782084,
1981
+ "grad_norm": 1.1952487230300903,
1982
+ "learning_rate": 2.925870490135255e-06,
1983
+ "loss": 0.0349,
1984
+ "num_input_tokens_seen": 6144792,
1985
+ "step": 2470
1986
+ },
1987
+ {
1988
+ "epoch": 2.400968523002421,
1989
+ "grad_norm": 0.08051615208387375,
1990
+ "learning_rate": 2.8361799349076143e-06,
1991
+ "loss": 0.0251,
1992
+ "num_input_tokens_seen": 6169688,
1993
+ "step": 2480
1994
+ },
1995
+ {
1996
+ "epoch": 2.4106537530266343,
1997
+ "grad_norm": 3.1085357666015625,
1998
+ "learning_rate": 2.747741840364593e-06,
1999
+ "loss": 0.0634,
2000
+ "num_input_tokens_seen": 6194680,
2001
+ "step": 2490
2002
+ },
2003
+ {
2004
+ "epoch": 2.4203389830508475,
2005
+ "grad_norm": 1.2273328304290771,
2006
+ "learning_rate": 2.6605653126287555e-06,
2007
+ "loss": 0.0451,
2008
+ "num_input_tokens_seen": 6218712,
2009
+ "step": 2500
2010
+ },
2011
+ {
2012
+ "epoch": 2.4300242130750607,
2013
+ "grad_norm": 2.9415712356567383,
2014
+ "learning_rate": 2.5746593279241105e-06,
2015
+ "loss": 0.0395,
2016
+ "num_input_tokens_seen": 6243384,
2017
+ "step": 2510
2018
+ },
2019
+ {
2020
+ "epoch": 2.4397094430992734,
2021
+ "grad_norm": 0.24813522398471832,
2022
+ "learning_rate": 2.490032731651833e-06,
2023
+ "loss": 0.0537,
2024
+ "num_input_tokens_seen": 6267416,
2025
+ "step": 2520
2026
+ },
2027
+ {
2028
+ "epoch": 2.4493946731234866,
2029
+ "grad_norm": 1.5883897542953491,
2030
+ "learning_rate": 2.4066942374795205e-06,
2031
+ "loss": 0.0402,
2032
+ "num_input_tokens_seen": 6292696,
2033
+ "step": 2530
2034
+ },
2035
+ {
2036
+ "epoch": 2.4590799031476998,
2037
+ "grad_norm": 0.41333088278770447,
2038
+ "learning_rate": 2.324652426443962e-06,
2039
+ "loss": 0.0295,
2040
+ "num_input_tokens_seen": 6317208,
2041
+ "step": 2540
2042
+ },
2043
+ {
2044
+ "epoch": 2.468765133171913,
2045
+ "grad_norm": 3.1688761711120605,
2046
+ "learning_rate": 2.243915746067587e-06,
2047
+ "loss": 0.0515,
2048
+ "num_input_tokens_seen": 6341688,
2049
+ "step": 2550
2050
+ },
2051
+ {
2052
+ "epoch": 2.478450363196126,
2053
+ "grad_norm": 0.7070954442024231,
2054
+ "learning_rate": 2.164492509488657e-06,
2055
+ "loss": 0.0443,
2056
+ "num_input_tokens_seen": 6366712,
2057
+ "step": 2560
2058
+ },
2059
+ {
2060
+ "epoch": 2.488135593220339,
2061
+ "grad_norm": 0.3987884819507599,
2062
+ "learning_rate": 2.086390894605288e-06,
2063
+ "loss": 0.0555,
2064
+ "num_input_tokens_seen": 6391256,
2065
+ "step": 2570
2066
+ },
2067
+ {
2068
+ "epoch": 2.497820823244552,
2069
+ "grad_norm": 1.7903181314468384,
2070
+ "learning_rate": 2.0096189432334194e-06,
2071
+ "loss": 0.054,
2072
+ "num_input_tokens_seen": 6416184,
2073
+ "step": 2580
2074
+ },
2075
+ {
2076
+ "epoch": 2.507506053268765,
2077
+ "grad_norm": 7.973659992218018,
2078
+ "learning_rate": 1.9341845602787733e-06,
2079
+ "loss": 0.075,
2080
+ "num_input_tokens_seen": 6441176,
2081
+ "step": 2590
2082
+ },
2083
+ {
2084
+ "epoch": 2.5171912832929784,
2085
+ "grad_norm": 2.1646482944488525,
2086
+ "learning_rate": 1.8600955129229009e-06,
2087
+ "loss": 0.0384,
2088
+ "num_input_tokens_seen": 6465688,
2089
+ "step": 2600
2090
+ },
2091
+ {
2092
+ "epoch": 2.526876513317191,
2093
+ "grad_norm": 0.9478936791419983,
2094
+ "learning_rate": 1.7873594298234557e-06,
2095
+ "loss": 0.038,
2096
+ "num_input_tokens_seen": 6490456,
2097
+ "step": 2610
2098
+ },
2099
+ {
2100
+ "epoch": 2.5365617433414043,
2101
+ "grad_norm": 0.5018621683120728,
2102
+ "learning_rate": 1.7159838003286848e-06,
2103
+ "loss": 0.0233,
2104
+ "num_input_tokens_seen": 6515704,
2105
+ "step": 2620
2106
+ },
2107
+ {
2108
+ "epoch": 2.5462469733656174,
2109
+ "grad_norm": 4.254843711853027,
2110
+ "learning_rate": 1.645975973706269e-06,
2111
+ "loss": 0.0634,
2112
+ "num_input_tokens_seen": 6540920,
2113
+ "step": 2630
2114
+ },
2115
+ {
2116
+ "epoch": 2.5559322033898306,
2117
+ "grad_norm": 0.3339782655239105,
2118
+ "learning_rate": 1.5773431583866227e-06,
2119
+ "loss": 0.0333,
2120
+ "num_input_tokens_seen": 6565880,
2121
+ "step": 2640
2122
+ },
2123
+ {
2124
+ "epoch": 2.565617433414044,
2125
+ "grad_norm": 2.9373421669006348,
2126
+ "learning_rate": 1.5100924212206534e-06,
2127
+ "loss": 0.0649,
2128
+ "num_input_tokens_seen": 6591000,
2129
+ "step": 2650
2130
+ },
2131
+ {
2132
+ "epoch": 2.5753026634382565,
2133
+ "grad_norm": 1.637086033821106,
2134
+ "learning_rate": 1.44423068675212e-06,
2135
+ "loss": 0.0531,
2136
+ "num_input_tokens_seen": 6615800,
2137
+ "step": 2660
2138
+ },
2139
+ {
2140
+ "epoch": 2.5849878934624697,
2141
+ "grad_norm": 0.06637797504663467,
2142
+ "learning_rate": 1.3797647365046411e-06,
2143
+ "loss": 0.0426,
2144
+ "num_input_tokens_seen": 6639288,
2145
+ "step": 2670
2146
+ },
2147
+ {
2148
+ "epoch": 2.594673123486683,
2149
+ "grad_norm": 0.9268229603767395,
2150
+ "learning_rate": 1.3167012082834212e-06,
2151
+ "loss": 0.0368,
2152
+ "num_input_tokens_seen": 6664632,
2153
+ "step": 2680
2154
+ },
2155
+ {
2156
+ "epoch": 2.6043583535108956,
2157
+ "grad_norm": 4.011239528656006,
2158
+ "learning_rate": 1.2550465954917932e-06,
2159
+ "loss": 0.0165,
2160
+ "num_input_tokens_seen": 6689496,
2161
+ "step": 2690
2162
+ },
2163
+ {
2164
+ "epoch": 2.614043583535109,
2165
+ "grad_norm": 3.382112741470337,
2166
+ "learning_rate": 1.1948072464626102e-06,
2167
+ "loss": 0.0331,
2168
+ "num_input_tokens_seen": 6714552,
2169
+ "step": 2700
2170
+ },
2171
+ {
2172
+ "epoch": 2.623728813559322,
2173
+ "grad_norm": 5.245890140533447,
2174
+ "learning_rate": 1.1359893638045854e-06,
2175
+ "loss": 0.0226,
2176
+ "num_input_tokens_seen": 6739320,
2177
+ "step": 2710
2178
+ },
2179
+ {
2180
+ "epoch": 2.633414043583535,
2181
+ "grad_norm": 2.0806005001068115,
2182
+ "learning_rate": 1.0785990037636335e-06,
2183
+ "loss": 0.0611,
2184
+ "num_input_tokens_seen": 6763352,
2185
+ "step": 2720
2186
+ },
2187
+ {
2188
+ "epoch": 2.6430992736077483,
2189
+ "grad_norm": 2.040339469909668,
2190
+ "learning_rate": 1.022642075599286e-06,
2191
+ "loss": 0.0615,
2192
+ "num_input_tokens_seen": 6787544,
2193
+ "step": 2730
2194
+ },
2195
+ {
2196
+ "epoch": 2.6527845036319615,
2197
+ "grad_norm": 4.939095973968506,
2198
+ "learning_rate": 9.68124340976232e-07,
2199
+ "loss": 0.0393,
2200
+ "num_input_tokens_seen": 6812760,
2201
+ "step": 2740
2202
+ },
2203
+ {
2204
+ "epoch": 2.6624697336561742,
2205
+ "grad_norm": 0.7793028354644775,
2206
+ "learning_rate": 9.150514133710647e-07,
2207
+ "loss": 0.0656,
2208
+ "num_input_tokens_seen": 6838008,
2209
+ "step": 2750
2210
+ },
2211
+ {
2212
+ "epoch": 2.6721549636803874,
2213
+ "grad_norm": 0.568551778793335,
2214
+ "learning_rate": 8.634287574942834e-07,
2215
+ "loss": 0.0452,
2216
+ "num_input_tokens_seen": 6863320,
2217
+ "step": 2760
2218
+ },
2219
+ {
2220
+ "epoch": 2.6818401937046006,
2221
+ "grad_norm": 5.33021354675293,
2222
+ "learning_rate": 8.132616887276212e-07,
2223
+ "loss": 0.0404,
2224
+ "num_input_tokens_seen": 6888824,
2225
+ "step": 2770
2226
+ },
2227
+ {
2228
+ "epoch": 2.6915254237288133,
2229
+ "grad_norm": 4.118853569030762,
2230
+ "learning_rate": 7.645553725767229e-07,
2231
+ "loss": 0.0543,
2232
+ "num_input_tokens_seen": 6913048,
2233
+ "step": 2780
2234
+ },
2235
+ {
2236
+ "epoch": 2.7012106537530265,
2237
+ "grad_norm": 1.218005895614624,
2238
+ "learning_rate": 7.173148241392957e-07,
2239
+ "loss": 0.0459,
2240
+ "num_input_tokens_seen": 6937432,
2241
+ "step": 2790
2242
+ },
2243
+ {
2244
+ "epoch": 2.7108958837772397,
2245
+ "grad_norm": 0.6871452927589417,
2246
+ "learning_rate": 6.71544907588712e-07,
2247
+ "loss": 0.0386,
2248
+ "num_input_tokens_seen": 6962584,
2249
+ "step": 2800
2250
+ },
2251
+ {
2252
+ "epoch": 2.720581113801453,
2253
+ "grad_norm": 2.3115310668945312,
2254
+ "learning_rate": 6.272503356731601e-07,
2255
+ "loss": 0.0714,
2256
+ "num_input_tokens_seen": 6987768,
2257
+ "step": 2810
2258
+ },
2259
+ {
2260
+ "epoch": 2.730266343825666,
2261
+ "grad_norm": 4.2863569259643555,
2262
+ "learning_rate": 5.84435669230401e-07,
2263
+ "loss": 0.0364,
2264
+ "num_input_tokens_seen": 7013336,
2265
+ "step": 2820
2266
+ },
2267
+ {
2268
+ "epoch": 2.739951573849879,
2269
+ "grad_norm": 0.879754900932312,
2270
+ "learning_rate": 5.431053167181515e-07,
2271
+ "loss": 0.0346,
2272
+ "num_input_tokens_seen": 7038648,
2273
+ "step": 2830
2274
+ },
2275
+ {
2276
+ "epoch": 2.749636803874092,
2277
+ "grad_norm": 1.9641544818878174,
2278
+ "learning_rate": 5.032635337601687e-07,
2279
+ "loss": 0.0337,
2280
+ "num_input_tokens_seen": 7064184,
2281
+ "step": 2840
2282
+ },
2283
+ {
2284
+ "epoch": 2.759322033898305,
2285
+ "grad_norm": 0.6523151993751526,
2286
+ "learning_rate": 4.6491442270805596e-07,
2287
+ "loss": 0.0229,
2288
+ "num_input_tokens_seen": 7089336,
2289
+ "step": 2850
2290
+ },
2291
+ {
2292
+ "epoch": 2.7690072639225183,
2293
+ "grad_norm": 0.46984636783599854,
2294
+ "learning_rate": 4.280619322188628e-07,
2295
+ "loss": 0.0472,
2296
+ "num_input_tokens_seen": 7114072,
2297
+ "step": 2860
2298
+ },
2299
+ {
2300
+ "epoch": 2.778692493946731,
2301
+ "grad_norm": 2.178297519683838,
2302
+ "learning_rate": 3.9270985684851545e-07,
2303
+ "loss": 0.0498,
2304
+ "num_input_tokens_seen": 7139576,
2305
+ "step": 2870
2306
+ },
2307
+ {
2308
+ "epoch": 2.788377723970944,
2309
+ "grad_norm": 3.751574993133545,
2310
+ "learning_rate": 3.588618366610941e-07,
2311
+ "loss": 0.0442,
2312
+ "num_input_tokens_seen": 7165432,
2313
+ "step": 2880
2314
+ },
2315
+ {
2316
+ "epoch": 2.7980629539951574,
2317
+ "grad_norm": 1.0459034442901611,
2318
+ "learning_rate": 3.2652135685403593e-07,
2319
+ "loss": 0.0324,
2320
+ "num_input_tokens_seen": 7190808,
2321
+ "step": 2890
2322
+ },
2323
+ {
2324
+ "epoch": 2.8077481840193705,
2325
+ "grad_norm": 3.6684751510620117,
2326
+ "learning_rate": 2.9569174739928096e-07,
2327
+ "loss": 0.0497,
2328
+ "num_input_tokens_seen": 7216440,
2329
+ "step": 2900
2330
+ },
2331
+ {
2332
+ "epoch": 2.8174334140435837,
2333
+ "grad_norm": 4.388014316558838,
2334
+ "learning_rate": 2.663761827003941e-07,
2335
+ "loss": 0.0404,
2336
+ "num_input_tokens_seen": 7243480,
2337
+ "step": 2910
2338
+ },
2339
+ {
2340
+ "epoch": 2.8271186440677964,
2341
+ "grad_norm": 6.251937389373779,
2342
+ "learning_rate": 2.38577681265707e-07,
2343
+ "loss": 0.0479,
2344
+ "num_input_tokens_seen": 7268568,
2345
+ "step": 2920
2346
+ },
2347
+ {
2348
+ "epoch": 2.8368038740920096,
2349
+ "grad_norm": 2.676504611968994,
2350
+ "learning_rate": 2.122991053975215e-07,
2351
+ "loss": 0.0378,
2352
+ "num_input_tokens_seen": 7293784,
2353
+ "step": 2930
2354
+ },
2355
+ {
2356
+ "epoch": 2.846489104116223,
2357
+ "grad_norm": 4.877316474914551,
2358
+ "learning_rate": 1.8754316089737878e-07,
2359
+ "loss": 0.0328,
2360
+ "num_input_tokens_seen": 7318680,
2361
+ "step": 2940
2362
+ },
2363
+ {
2364
+ "epoch": 2.856174334140436,
2365
+ "grad_norm": 1.454691767692566,
2366
+ "learning_rate": 1.6431239678746546e-07,
2367
+ "loss": 0.0411,
2368
+ "num_input_tokens_seen": 7343864,
2369
+ "step": 2950
2370
+ },
2371
+ {
2372
+ "epoch": 2.8658595641646487,
2373
+ "grad_norm": 3.7415764331817627,
2374
+ "learning_rate": 1.4260920504814366e-07,
2375
+ "loss": 0.0649,
2376
+ "num_input_tokens_seen": 7370232,
2377
+ "step": 2960
2378
+ },
2379
+ {
2380
+ "epoch": 2.875544794188862,
2381
+ "grad_norm": 2.577986240386963,
2382
+ "learning_rate": 1.22435820371658e-07,
2383
+ "loss": 0.0462,
2384
+ "num_input_tokens_seen": 7394936,
2385
+ "step": 2970
2386
+ },
2387
+ {
2388
+ "epoch": 2.885230024213075,
2389
+ "grad_norm": 4.861838340759277,
2390
+ "learning_rate": 1.0379431993204458e-07,
2391
+ "loss": 0.0425,
2392
+ "num_input_tokens_seen": 7420088,
2393
+ "step": 2980
2394
+ },
2395
+ {
2396
+ "epoch": 2.8949152542372882,
2397
+ "grad_norm": 3.2706315517425537,
2398
+ "learning_rate": 8.668662317124043e-08,
2399
+ "loss": 0.0418,
2400
+ "num_input_tokens_seen": 7445048,
2401
+ "step": 2990
2402
+ },
2403
+ {
2404
+ "epoch": 2.9046004842615014,
2405
+ "grad_norm": 0.6351612210273743,
2406
+ "learning_rate": 7.111449160146333e-08,
2407
+ "loss": 0.022,
2408
+ "num_input_tokens_seen": 7469144,
2409
+ "step": 3000
2410
+ }
2411
+ ],
2412
+ "logging_steps": 10,
2413
+ "max_steps": 3096,
2414
+ "num_input_tokens_seen": 7469144,
2415
+ "num_train_epochs": 3,
2416
+ "save_steps": 1000,
2417
+ "stateful_callbacks": {
2418
+ "TrainerControl": {
2419
+ "args": {
2420
+ "should_epoch_stop": false,
2421
+ "should_evaluate": false,
2422
+ "should_log": false,
2423
+ "should_save": true,
2424
+ "should_training_stop": false
2425
+ },
2426
+ "attributes": {}
2427
+ }
2428
+ },
2429
+ "total_flos": 3.197440763416412e+17,
2430
+ "train_batch_size": 4,
2431
+ "trial_name": null,
2432
+ "trial_params": null
2433
+ }
checkpoint-3000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de7a6d9a1a05e78971782a6ee6bb88dfb9617ab9f9e2f35984cd80b0711875f6
3
+ size 5688
checkpoint-3096/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
checkpoint-3096/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "up_proj",
24
+ "gate_proj",
25
+ "q_proj",
26
+ "down_proj",
27
+ "v_proj",
28
+ "k_proj",
29
+ "o_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
checkpoint-3096/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e945a3ed8fb6a8e001cf93765506fa4f4e8a5f26f7e25358e3647e1e889d3d2
3
+ size 83945296
checkpoint-3096/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d6e06e3ea474b930e3eb1a1eda05de3340e31330c04a77f3de0619d0b65e354
3
+ size 168149074
checkpoint-3096/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9196a1e708bf24d6abba41cce3f8558820acc3e50f9394c5955e29eb41ffea3d
3
+ size 14244
checkpoint-3096/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78c499ee35917a6999d5080692bb0d882ad4d51ced03b11970197ebcff165fb6
3
+ size 1064
checkpoint-3096/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-3096/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3096/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
3
+ size 587404
checkpoint-3096/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff