beamaia committed on
Commit
de5bdfb
·
verified ·
1 Parent(s): 231bf01

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +87 -5
  2. adapter_config.json +3 -3
  3. checkpoint-1520/README.md +204 -0
  4. checkpoint-1520/adapter_config.json +29 -0
  5. checkpoint-1520/adapter_model.safetensors +3 -0
  6. checkpoint-1520/optimizer.pt +3 -0
  7. checkpoint-1520/rng_state_0.pth +3 -0
  8. checkpoint-1520/rng_state_1.pth +3 -0
  9. checkpoint-1520/scheduler.pt +3 -0
  10. checkpoint-1520/special_tokens_map.json +24 -0
  11. checkpoint-1520/tokenizer.json +0 -0
  12. checkpoint-1520/tokenizer.model +3 -0
  13. checkpoint-1520/tokenizer_config.json +43 -0
  14. checkpoint-1520/trainer_state.json +1161 -0
  15. checkpoint-1520/training_args.bin +3 -0
  16. checkpoint-1540/README.md +204 -0
  17. checkpoint-1540/adapter_config.json +29 -0
  18. checkpoint-1540/adapter_model.safetensors +3 -0
  19. checkpoint-1540/optimizer.pt +3 -0
  20. checkpoint-1540/rng_state_0.pth +3 -0
  21. checkpoint-1540/rng_state_1.pth +3 -0
  22. checkpoint-1540/scheduler.pt +3 -0
  23. checkpoint-1540/special_tokens_map.json +24 -0
  24. checkpoint-1540/tokenizer.json +0 -0
  25. checkpoint-1540/tokenizer.model +3 -0
  26. checkpoint-1540/tokenizer_config.json +43 -0
  27. checkpoint-1540/trainer_state.json +1176 -0
  28. checkpoint-1540/training_args.bin +3 -0
  29. checkpoint-1560/README.md +204 -0
  30. checkpoint-1560/adapter_config.json +29 -0
  31. checkpoint-1560/adapter_model.safetensors +3 -0
  32. checkpoint-1560/optimizer.pt +3 -0
  33. checkpoint-1560/rng_state_0.pth +3 -0
  34. checkpoint-1560/rng_state_1.pth +3 -0
  35. checkpoint-1560/scheduler.pt +3 -0
  36. checkpoint-1560/special_tokens_map.json +24 -0
  37. checkpoint-1560/tokenizer.json +0 -0
  38. checkpoint-1560/tokenizer.model +3 -0
  39. checkpoint-1560/tokenizer_config.json +43 -0
  40. checkpoint-1560/trainer_state.json +1191 -0
  41. checkpoint-1560/training_args.bin +3 -0
  42. checkpoint-1580/README.md +204 -0
  43. checkpoint-1580/adapter_config.json +29 -0
  44. checkpoint-1580/adapter_model.safetensors +3 -0
  45. checkpoint-1580/optimizer.pt +3 -0
  46. checkpoint-1580/rng_state_0.pth +3 -0
  47. checkpoint-1580/rng_state_1.pth +3 -0
  48. checkpoint-1580/scheduler.pt +3 -0
  49. checkpoint-1580/special_tokens_map.json +24 -0
  50. checkpoint-1580/tokenizer.json +0 -0
README.md CHANGED
@@ -16,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # ZeroShot-3.3.3-Mistral-7b-Multilanguage-3.2.0
18
 
19
- This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 1.7469
22
 
23
  ## Model description
24
 
@@ -48,17 +48,99 @@ The following hyperparameters were used during training:
48
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
  - lr_scheduler_type: linear
50
  - lr_scheduler_warmup_ratio: 0.1
51
- - training_steps: 2
52
  - mixed_precision_training: Native AMP
53
 
54
  ### Training results
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
 
58
  ### Framework versions
59
 
60
- - PEFT 0.7.1
61
  - Transformers 4.39.0.dev0
62
  - Pytorch 2.1.0+cu118
63
- - Datasets 2.16.1
64
  - Tokenizers 0.15.1
 
16
 
17
  # ZeroShot-3.3.3-Mistral-7b-Multilanguage-3.2.0
18
 
19
+ This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.3754
22
 
23
  ## Model description
24
 
 
48
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
  - lr_scheduler_type: linear
50
  - lr_scheduler_warmup_ratio: 0.1
51
+ - training_steps: 1612
52
  - mixed_precision_training: Native AMP
53
 
54
  ### Training results
55
 
56
+ | Training Loss | Epoch | Step | Validation Loss |
57
+ |:-------------:|:-----:|:----:|:---------------:|
58
+ | 1.8728 | 0.01 | 20 | 1.7906 |
59
+ | 1.4796 | 0.02 | 40 | 1.1876 |
60
+ | 0.8318 | 0.04 | 60 | 0.6326 |
61
+ | 0.5478 | 0.05 | 80 | 0.5419 |
62
+ | 0.517 | 0.06 | 100 | 0.5157 |
63
+ | 0.5109 | 0.07 | 120 | 0.4906 |
64
+ | 0.4656 | 0.09 | 140 | 0.4658 |
65
+ | 0.4409 | 0.1 | 160 | 0.4519 |
66
+ | 0.4316 | 0.11 | 180 | 0.4475 |
67
+ | 0.4297 | 0.12 | 200 | 0.4428 |
68
+ | 0.4226 | 0.14 | 220 | 0.4389 |
69
+ | 0.4321 | 0.15 | 240 | 0.4360 |
70
+ | 0.4261 | 0.16 | 260 | 0.4337 |
71
+ | 0.4235 | 0.17 | 280 | 0.4307 |
72
+ | 0.4279 | 0.19 | 300 | 0.4280 |
73
+ | 0.419 | 0.2 | 320 | 0.4253 |
74
+ | 0.4129 | 0.21 | 340 | 0.4230 |
75
+ | 0.4097 | 0.22 | 360 | 0.4223 |
76
+ | 0.4204 | 0.24 | 380 | 0.4200 |
77
+ | 0.4042 | 0.25 | 400 | 0.4191 |
78
+ | 0.4134 | 0.26 | 420 | 0.4176 |
79
+ | 0.4006 | 0.27 | 440 | 0.4158 |
80
+ | 0.4004 | 0.29 | 460 | 0.4141 |
81
+ | 0.3967 | 0.3 | 480 | 0.4123 |
82
+ | 0.4089 | 0.31 | 500 | 0.4100 |
83
+ | 0.3924 | 0.32 | 520 | 0.4087 |
84
+ | 0.4118 | 0.33 | 540 | 0.4079 |
85
+ | 0.4027 | 0.35 | 560 | 0.4069 |
86
+ | 0.393 | 0.36 | 580 | 0.4055 |
87
+ | 0.4103 | 0.37 | 600 | 0.4047 |
88
+ | 0.3896 | 0.38 | 620 | 0.4033 |
89
+ | 0.3912 | 0.4 | 640 | 0.4016 |
90
+ | 0.3897 | 0.41 | 660 | 0.4012 |
91
+ | 0.3963 | 0.42 | 680 | 0.3994 |
92
+ | 0.3914 | 0.43 | 700 | 0.3981 |
93
+ | 0.3769 | 0.45 | 720 | 0.3970 |
94
+ | 0.3904 | 0.46 | 740 | 0.3970 |
95
+ | 0.3831 | 0.47 | 760 | 0.3951 |
96
+ | 0.3922 | 0.48 | 780 | 0.3943 |
97
+ | 0.403 | 0.5 | 800 | 0.3928 |
98
+ | 0.3913 | 0.51 | 820 | 0.3922 |
99
+ | 0.3836 | 0.52 | 840 | 0.3913 |
100
+ | 0.3736 | 0.53 | 860 | 0.3903 |
101
+ | 0.3773 | 0.55 | 880 | 0.3897 |
102
+ | 0.3883 | 0.56 | 900 | 0.3890 |
103
+ | 0.3751 | 0.57 | 920 | 0.3884 |
104
+ | 0.3832 | 0.58 | 940 | 0.3874 |
105
+ | 0.3726 | 0.6 | 960 | 0.3869 |
106
+ | 0.3738 | 0.61 | 980 | 0.3861 |
107
+ | 0.3809 | 0.62 | 1000 | 0.3855 |
108
+ | 0.3871 | 0.63 | 1020 | 0.3845 |
109
+ | 0.3799 | 0.64 | 1040 | 0.3838 |
110
+ | 0.3882 | 0.66 | 1060 | 0.3831 |
111
+ | 0.3846 | 0.67 | 1080 | 0.3823 |
112
+ | 0.3696 | 0.68 | 1100 | 0.3821 |
113
+ | 0.3791 | 0.69 | 1120 | 0.3816 |
114
+ | 0.3726 | 0.71 | 1140 | 0.3808 |
115
+ | 0.3698 | 0.72 | 1160 | 0.3804 |
116
+ | 0.3777 | 0.73 | 1180 | 0.3800 |
117
+ | 0.3637 | 0.74 | 1200 | 0.3794 |
118
+ | 0.3653 | 0.76 | 1220 | 0.3787 |
119
+ | 0.382 | 0.77 | 1240 | 0.3783 |
120
+ | 0.3587 | 0.78 | 1260 | 0.3781 |
121
+ | 0.3729 | 0.79 | 1280 | 0.3776 |
122
+ | 0.3731 | 0.81 | 1300 | 0.3772 |
123
+ | 0.3757 | 0.82 | 1320 | 0.3770 |
124
+ | 0.3733 | 0.83 | 1340 | 0.3767 |
125
+ | 0.3792 | 0.84 | 1360 | 0.3764 |
126
+ | 0.3678 | 0.86 | 1380 | 0.3761 |
127
+ | 0.3604 | 0.87 | 1400 | 0.3759 |
128
+ | 0.3496 | 0.88 | 1420 | 0.3758 |
129
+ | 0.3676 | 0.89 | 1440 | 0.3757 |
130
+ | 0.3678 | 0.91 | 1460 | 0.3757 |
131
+ | 0.3646 | 0.92 | 1480 | 0.3755 |
132
+ | 0.3621 | 0.93 | 1500 | 0.3755 |
133
+ | 0.3825 | 0.94 | 1520 | 0.3754 |
134
+ | 0.3718 | 0.95 | 1540 | 0.3754 |
135
+ | 0.3511 | 0.97 | 1560 | 0.3754 |
136
+ | 0.3716 | 0.98 | 1580 | 0.3754 |
137
+ | 0.3766 | 0.99 | 1600 | 0.3754 |
138
 
139
 
140
  ### Framework versions
141
 
142
+ - PEFT 0.8.2
143
  - Transformers 4.39.0.dev0
144
  - Pytorch 2.1.0+cu118
145
+ - Datasets 2.17.1
146
  - Tokenizers 0.15.1
adapter_config.json CHANGED
@@ -19,10 +19,10 @@
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
22
- "k_proj",
23
  "v_proj",
24
- "o_proj",
25
- "q_proj"
 
26
  ],
27
  "task_type": "CAUSAL_LM",
28
  "use_rslora": false
 
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
 
22
  "v_proj",
23
+ "k_proj",
24
+ "q_proj",
25
+ "o_proj"
26
  ],
27
  "task_type": "CAUSAL_LM",
28
  "use_rslora": false
checkpoint-1520/README.md ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+
201
+
202
+ ### Framework versions
203
+
204
+ - PEFT 0.8.2
checkpoint-1520/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.1,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 8,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "v_proj",
23
+ "k_proj",
24
+ "q_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": "CAUSAL_LM",
28
+ "use_rslora": false
29
+ }
checkpoint-1520/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cc68e493b63ec961fc5247aa68cd10a411aa1e1dd75b04a4314b0e7a17cdc3d
3
+ size 27297032
checkpoint-1520/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b09f56e916f55df970291d6958ed0ccd42d06d29342b1918fff3ef348898f35a
3
+ size 54678266
checkpoint-1520/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09386417525b0d73a8e964b5511ca3b5b6f91c924fd35779e18b740cd6d2ddf5
3
+ size 14512
checkpoint-1520/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b312c5394a834d17495ea557792207f77d8e23a366e1f149500025e29aa3f2d7
3
+ size 14512
checkpoint-1520/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fdd37660c565d65c59b9ac7504502e06e679b67458a6c6e51eb584b70628354
3
+ size 1000
checkpoint-1520/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-1520/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1520/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-1520/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [],
31
+ "bos_token": "<s>",
32
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
checkpoint-1520/trainer_state.json ADDED
@@ -0,0 +1,1161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.3754417300224304,
3
+ "best_model_checkpoint": "./mistral/22-02-24-Weni-ZeroShot-3.3.3-Mistral-7b-Multilanguage-3.2.0_Zeroshot-2_max_steps-1612_batch_16_2024-02-22_ppid_1326/checkpoint-1520",
4
+ "epoch": 0.9423434593924365,
5
+ "eval_steps": 20,
6
+ "global_step": 1520,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "grad_norm": 1.6690024137496948,
14
+ "learning_rate": 2.3602484472049692e-05,
15
+ "loss": 1.8728,
16
+ "step": 20
17
+ },
18
+ {
19
+ "epoch": 0.01,
20
+ "eval_loss": 1.7905555963516235,
21
+ "eval_runtime": 165.4628,
22
+ "eval_samples_per_second": 17.327,
23
+ "eval_steps_per_second": 4.333,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.02,
28
+ "grad_norm": 1.7746976613998413,
29
+ "learning_rate": 4.8447204968944106e-05,
30
+ "loss": 1.4796,
31
+ "step": 40
32
+ },
33
+ {
34
+ "epoch": 0.02,
35
+ "eval_loss": 1.1875672340393066,
36
+ "eval_runtime": 165.7743,
37
+ "eval_samples_per_second": 17.295,
38
+ "eval_steps_per_second": 4.325,
39
+ "step": 40
40
+ },
41
+ {
42
+ "epoch": 0.04,
43
+ "grad_norm": 1.1963611841201782,
44
+ "learning_rate": 7.329192546583851e-05,
45
+ "loss": 0.8318,
46
+ "step": 60
47
+ },
48
+ {
49
+ "epoch": 0.04,
50
+ "eval_loss": 0.6325646638870239,
51
+ "eval_runtime": 165.8606,
52
+ "eval_samples_per_second": 17.286,
53
+ "eval_steps_per_second": 4.323,
54
+ "step": 60
55
+ },
56
+ {
57
+ "epoch": 0.05,
58
+ "grad_norm": 0.6274264454841614,
59
+ "learning_rate": 9.813664596273293e-05,
60
+ "loss": 0.5478,
61
+ "step": 80
62
+ },
63
+ {
64
+ "epoch": 0.05,
65
+ "eval_loss": 0.541927695274353,
66
+ "eval_runtime": 165.8755,
67
+ "eval_samples_per_second": 17.284,
68
+ "eval_steps_per_second": 4.323,
69
+ "step": 80
70
+ },
71
+ {
72
+ "epoch": 0.06,
73
+ "grad_norm": 0.7583674788475037,
74
+ "learning_rate": 0.00012298136645962735,
75
+ "loss": 0.517,
76
+ "step": 100
77
+ },
78
+ {
79
+ "epoch": 0.06,
80
+ "eval_loss": 0.5157255530357361,
81
+ "eval_runtime": 165.8227,
82
+ "eval_samples_per_second": 17.29,
83
+ "eval_steps_per_second": 4.324,
84
+ "step": 100
85
+ },
86
+ {
87
+ "epoch": 0.07,
88
+ "grad_norm": 0.496155321598053,
89
+ "learning_rate": 0.00014782608695652173,
90
+ "loss": 0.5109,
91
+ "step": 120
92
+ },
93
+ {
94
+ "epoch": 0.07,
95
+ "eval_loss": 0.49060019850730896,
96
+ "eval_runtime": 165.8171,
97
+ "eval_samples_per_second": 17.29,
98
+ "eval_steps_per_second": 4.324,
99
+ "step": 120
100
+ },
101
+ {
102
+ "epoch": 0.09,
103
+ "grad_norm": 0.3945171535015106,
104
+ "learning_rate": 0.00017142857142857143,
105
+ "loss": 0.4656,
106
+ "step": 140
107
+ },
108
+ {
109
+ "epoch": 0.09,
110
+ "eval_loss": 0.4657692313194275,
111
+ "eval_runtime": 165.7753,
112
+ "eval_samples_per_second": 17.294,
113
+ "eval_steps_per_second": 4.325,
114
+ "step": 140
115
+ },
116
+ {
117
+ "epoch": 0.1,
118
+ "grad_norm": 0.3318285346031189,
119
+ "learning_rate": 0.00019627329192546585,
120
+ "loss": 0.4409,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.1,
125
+ "eval_loss": 0.45186159014701843,
126
+ "eval_runtime": 165.7746,
127
+ "eval_samples_per_second": 17.295,
128
+ "eval_steps_per_second": 4.325,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 0.11,
133
+ "grad_norm": 0.4603807330131531,
134
+ "learning_rate": 0.00019993226958500473,
135
+ "loss": 0.4316,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.11,
140
+ "eval_loss": 0.4474850594997406,
141
+ "eval_runtime": 165.6607,
142
+ "eval_samples_per_second": 17.306,
143
+ "eval_steps_per_second": 4.328,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 0.12,
148
+ "grad_norm": 0.4568885862827301,
149
+ "learning_rate": 0.00019967929472585524,
150
+ "loss": 0.4297,
151
+ "step": 200
152
+ },
153
+ {
154
+ "epoch": 0.12,
155
+ "eval_loss": 0.4427547752857208,
156
+ "eval_runtime": 165.7208,
157
+ "eval_samples_per_second": 17.3,
158
+ "eval_steps_per_second": 4.327,
159
+ "step": 200
160
+ },
161
+ {
162
+ "epoch": 0.14,
163
+ "grad_norm": 0.384003221988678,
164
+ "learning_rate": 0.00019923944021970962,
165
+ "loss": 0.4226,
166
+ "step": 220
167
+ },
168
+ {
169
+ "epoch": 0.14,
170
+ "eval_loss": 0.4389376938343048,
171
+ "eval_runtime": 165.7023,
172
+ "eval_samples_per_second": 17.302,
173
+ "eval_steps_per_second": 4.327,
174
+ "step": 220
175
+ },
176
+ {
177
+ "epoch": 0.15,
178
+ "grad_norm": 0.30612272024154663,
179
+ "learning_rate": 0.00019861353070979048,
180
+ "loss": 0.4321,
181
+ "step": 240
182
+ },
183
+ {
184
+ "epoch": 0.15,
185
+ "eval_loss": 0.4359733462333679,
186
+ "eval_runtime": 165.7286,
187
+ "eval_samples_per_second": 17.299,
188
+ "eval_steps_per_second": 4.326,
189
+ "step": 240
190
+ },
191
+ {
192
+ "epoch": 0.16,
193
+ "grad_norm": 0.4149855971336365,
194
+ "learning_rate": 0.0001978027396569313,
195
+ "loss": 0.4261,
196
+ "step": 260
197
+ },
198
+ {
199
+ "epoch": 0.16,
200
+ "eval_loss": 0.4336954355239868,
201
+ "eval_runtime": 165.7171,
202
+ "eval_samples_per_second": 17.301,
203
+ "eval_steps_per_second": 4.327,
204
+ "step": 260
205
+ },
206
+ {
207
+ "epoch": 0.17,
208
+ "grad_norm": 0.28470170497894287,
209
+ "learning_rate": 0.00019680858713956126,
210
+ "loss": 0.4235,
211
+ "step": 280
212
+ },
213
+ {
214
+ "epoch": 0.17,
215
+ "eval_loss": 0.4306911528110504,
216
+ "eval_runtime": 165.7298,
217
+ "eval_samples_per_second": 17.299,
218
+ "eval_steps_per_second": 4.326,
219
+ "step": 280
220
+ },
221
+ {
222
+ "epoch": 0.19,
223
+ "grad_norm": 0.3317676782608032,
224
+ "learning_rate": 0.00019563293700384832,
225
+ "loss": 0.4279,
226
+ "step": 300
227
+ },
228
+ {
229
+ "epoch": 0.19,
230
+ "eval_loss": 0.4280063509941101,
231
+ "eval_runtime": 165.7359,
232
+ "eval_samples_per_second": 17.299,
233
+ "eval_steps_per_second": 4.326,
234
+ "step": 300
235
+ },
236
+ {
237
+ "epoch": 0.2,
238
+ "grad_norm": 0.3677004277706146,
239
+ "learning_rate": 0.0001942779933693437,
240
+ "loss": 0.419,
241
+ "step": 320
242
+ },
243
+ {
244
+ "epoch": 0.2,
245
+ "eval_loss": 0.425252765417099,
246
+ "eval_runtime": 165.7143,
247
+ "eval_samples_per_second": 17.301,
248
+ "eval_steps_per_second": 4.327,
249
+ "step": 320
250
+ },
251
+ {
252
+ "epoch": 0.21,
253
+ "grad_norm": 0.34667083621025085,
254
+ "learning_rate": 0.00019274629649667838,
255
+ "loss": 0.4129,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 0.21,
260
+ "eval_loss": 0.423022985458374,
261
+ "eval_runtime": 165.6904,
262
+ "eval_samples_per_second": 17.303,
263
+ "eval_steps_per_second": 4.327,
264
+ "step": 340
265
+ },
266
+ {
267
+ "epoch": 0.22,
268
+ "grad_norm": 0.37289124727249146,
269
+ "learning_rate": 0.00019104071802505943,
270
+ "loss": 0.4097,
271
+ "step": 360
272
+ },
273
+ {
274
+ "epoch": 0.22,
275
+ "eval_loss": 0.4223038852214813,
276
+ "eval_runtime": 165.7184,
277
+ "eval_samples_per_second": 17.3,
278
+ "eval_steps_per_second": 4.327,
279
+ "step": 360
280
+ },
281
+ {
282
+ "epoch": 0.24,
283
+ "grad_norm": 0.9662333130836487,
284
+ "learning_rate": 0.00018926226943248415,
285
+ "loss": 0.4204,
286
+ "step": 380
287
+ },
288
+ {
289
+ "epoch": 0.24,
290
+ "eval_loss": 0.41996675729751587,
291
+ "eval_runtime": 165.7378,
292
+ "eval_samples_per_second": 17.298,
293
+ "eval_steps_per_second": 4.326,
294
+ "step": 380
295
+ },
296
+ {
297
+ "epoch": 0.25,
298
+ "grad_norm": 0.43084949254989624,
299
+ "learning_rate": 0.00018722711057125052,
300
+ "loss": 0.4042,
301
+ "step": 400
302
+ },
303
+ {
304
+ "epoch": 0.25,
305
+ "eval_loss": 0.4190637767314911,
306
+ "eval_runtime": 165.6378,
307
+ "eval_samples_per_second": 17.309,
308
+ "eval_steps_per_second": 4.329,
309
+ "step": 400
310
+ },
311
+ {
312
+ "epoch": 0.26,
313
+ "grad_norm": 0.3700352609157562,
314
+ "learning_rate": 0.00018502841753095908,
315
+ "loss": 0.4134,
316
+ "step": 420
317
+ },
318
+ {
319
+ "epoch": 0.26,
320
+ "eval_loss": 0.4176079034805298,
321
+ "eval_runtime": 165.7021,
322
+ "eval_samples_per_second": 17.302,
323
+ "eval_steps_per_second": 4.327,
324
+ "step": 420
325
+ },
326
+ {
327
+ "epoch": 0.27,
328
+ "grad_norm": 0.2829599976539612,
329
+ "learning_rate": 0.00018267031244128938,
330
+ "loss": 0.4006,
331
+ "step": 440
332
+ },
333
+ {
334
+ "epoch": 0.27,
335
+ "eval_loss": 0.4157721698284149,
336
+ "eval_runtime": 165.7174,
337
+ "eval_samples_per_second": 17.301,
338
+ "eval_steps_per_second": 4.327,
339
+ "step": 440
340
+ },
341
+ {
342
+ "epoch": 0.29,
343
+ "grad_norm": 0.2943759560585022,
344
+ "learning_rate": 0.00018015721629907882,
345
+ "loss": 0.4004,
346
+ "step": 460
347
+ },
348
+ {
349
+ "epoch": 0.29,
350
+ "eval_loss": 0.4140998125076294,
351
+ "eval_runtime": 165.7622,
352
+ "eval_samples_per_second": 17.296,
353
+ "eval_steps_per_second": 4.325,
354
+ "step": 460
355
+ },
356
+ {
357
+ "epoch": 0.3,
358
+ "grad_norm": 0.36071881651878357,
359
+ "learning_rate": 0.00017749384067979764,
360
+ "loss": 0.3967,
361
+ "step": 480
362
+ },
363
+ {
364
+ "epoch": 0.3,
365
+ "eval_loss": 0.4122526943683624,
366
+ "eval_runtime": 165.6609,
367
+ "eval_samples_per_second": 17.306,
368
+ "eval_steps_per_second": 4.328,
369
+ "step": 480
370
+ },
371
+ {
372
+ "epoch": 0.31,
373
+ "grad_norm": 0.3050592243671417,
374
+ "learning_rate": 0.00017468517890424455,
375
+ "loss": 0.4089,
376
+ "step": 500
377
+ },
378
+ {
379
+ "epoch": 0.31,
380
+ "eval_loss": 0.4099767506122589,
381
+ "eval_runtime": 165.7178,
382
+ "eval_samples_per_second": 17.3,
383
+ "eval_steps_per_second": 4.327,
384
+ "step": 500
385
+ },
386
+ {
387
+ "epoch": 0.32,
388
+ "grad_norm": 0.3138140141963959,
389
+ "learning_rate": 0.00017173649667702337,
390
+ "loss": 0.3924,
391
+ "step": 520
392
+ },
393
+ {
394
+ "epoch": 0.32,
395
+ "eval_loss": 0.40873026847839355,
396
+ "eval_runtime": 165.7231,
397
+ "eval_samples_per_second": 17.3,
398
+ "eval_steps_per_second": 4.326,
399
+ "step": 520
400
+ },
401
+ {
402
+ "epoch": 0.33,
403
+ "grad_norm": 0.299787700176239,
404
+ "learning_rate": 0.0001686533222143523,
405
+ "loss": 0.4118,
406
+ "step": 540
407
+ },
408
+ {
409
+ "epoch": 0.33,
410
+ "eval_loss": 0.40794187784194946,
411
+ "eval_runtime": 165.723,
412
+ "eval_samples_per_second": 17.3,
413
+ "eval_steps_per_second": 4.326,
414
+ "step": 540
415
+ },
416
+ {
417
+ "epoch": 0.35,
418
+ "grad_norm": 0.37338224053382874,
419
+ "learning_rate": 0.0001654414358797141,
420
+ "loss": 0.4027,
421
+ "step": 560
422
+ },
423
+ {
424
+ "epoch": 0.35,
425
+ "eval_loss": 0.4068893790245056,
426
+ "eval_runtime": 165.5504,
427
+ "eval_samples_per_second": 17.318,
428
+ "eval_steps_per_second": 4.331,
429
+ "step": 560
430
+ },
431
+ {
432
+ "epoch": 0.36,
433
+ "grad_norm": 0.3397510349750519,
434
+ "learning_rate": 0.00016210685934677782,
435
+ "loss": 0.393,
436
+ "step": 580
437
+ },
438
+ {
439
+ "epoch": 0.36,
440
+ "eval_loss": 0.40551885962486267,
441
+ "eval_runtime": 165.5533,
442
+ "eval_samples_per_second": 17.318,
443
+ "eval_steps_per_second": 4.331,
444
+ "step": 580
445
+ },
446
+ {
447
+ "epoch": 0.37,
448
+ "grad_norm": 0.3803115487098694,
449
+ "learning_rate": 0.00015883106145163397,
450
+ "loss": 0.4103,
451
+ "step": 600
452
+ },
453
+ {
454
+ "epoch": 0.37,
455
+ "eval_loss": 0.40474218130111694,
456
+ "eval_runtime": 165.6206,
457
+ "eval_samples_per_second": 17.311,
458
+ "eval_steps_per_second": 4.329,
459
+ "step": 600
460
+ },
461
+ {
462
+ "epoch": 0.38,
463
+ "grad_norm": 0.3368137776851654,
464
+ "learning_rate": 0.00015527541943543543,
465
+ "loss": 0.3896,
466
+ "step": 620
467
+ },
468
+ {
469
+ "epoch": 0.38,
470
+ "eval_loss": 0.40325844287872314,
471
+ "eval_runtime": 165.6617,
472
+ "eval_samples_per_second": 17.306,
473
+ "eval_steps_per_second": 4.328,
474
+ "step": 620
475
+ },
476
+ {
477
+ "epoch": 0.4,
478
+ "grad_norm": 0.32399722933769226,
479
+ "learning_rate": 0.00015161614656089196,
480
+ "loss": 0.3912,
481
+ "step": 640
482
+ },
483
+ {
484
+ "epoch": 0.4,
485
+ "eval_loss": 0.4015989899635315,
486
+ "eval_runtime": 165.6304,
487
+ "eval_samples_per_second": 17.31,
488
+ "eval_steps_per_second": 4.329,
489
+ "step": 640
490
+ },
491
+ {
492
+ "epoch": 0.41,
493
+ "grad_norm": 0.42368754744529724,
494
+ "learning_rate": 0.0001478601032660207,
495
+ "loss": 0.3897,
496
+ "step": 660
497
+ },
498
+ {
499
+ "epoch": 0.41,
500
+ "eval_loss": 0.40123647451400757,
501
+ "eval_runtime": 165.6021,
502
+ "eval_samples_per_second": 17.313,
503
+ "eval_steps_per_second": 4.33,
504
+ "step": 660
505
+ },
506
+ {
507
+ "epoch": 0.42,
508
+ "grad_norm": 0.36450713872909546,
509
+ "learning_rate": 0.00014401433141490152,
510
+ "loss": 0.3963,
511
+ "step": 680
512
+ },
513
+ {
514
+ "epoch": 0.42,
515
+ "eval_loss": 0.39942467212677,
516
+ "eval_runtime": 165.5741,
517
+ "eval_samples_per_second": 17.316,
518
+ "eval_steps_per_second": 4.33,
519
+ "step": 680
520
+ },
521
+ {
522
+ "epoch": 0.43,
523
+ "grad_norm": 0.3598613739013672,
524
+ "learning_rate": 0.00014008604109552665,
525
+ "loss": 0.3914,
526
+ "step": 700
527
+ },
528
+ {
529
+ "epoch": 0.43,
530
+ "eval_loss": 0.39811620116233826,
531
+ "eval_runtime": 165.6088,
532
+ "eval_samples_per_second": 17.312,
533
+ "eval_steps_per_second": 4.329,
534
+ "step": 700
535
+ },
536
+ {
537
+ "epoch": 0.45,
538
+ "grad_norm": 0.3440189063549042,
539
+ "learning_rate": 0.00013608259710226186,
540
+ "loss": 0.3769,
541
+ "step": 720
542
+ },
543
+ {
544
+ "epoch": 0.45,
545
+ "eval_loss": 0.396987646818161,
546
+ "eval_runtime": 165.6485,
547
+ "eval_samples_per_second": 17.308,
548
+ "eval_steps_per_second": 4.328,
549
+ "step": 720
550
+ },
551
+ {
552
+ "epoch": 0.46,
553
+ "grad_norm": 0.3481440544128418,
554
+ "learning_rate": 0.0001320115051282632,
555
+ "loss": 0.3904,
556
+ "step": 740
557
+ },
558
+ {
559
+ "epoch": 0.46,
560
+ "eval_loss": 0.3970092833042145,
561
+ "eval_runtime": 165.6528,
562
+ "eval_samples_per_second": 17.307,
563
+ "eval_steps_per_second": 4.328,
564
+ "step": 740
565
+ },
566
+ {
567
+ "epoch": 0.47,
568
+ "grad_norm": 0.4135587215423584,
569
+ "learning_rate": 0.0001278803976937355,
570
+ "loss": 0.3831,
571
+ "step": 760
572
+ },
573
+ {
574
+ "epoch": 0.47,
575
+ "eval_loss": 0.39514589309692383,
576
+ "eval_runtime": 165.7492,
577
+ "eval_samples_per_second": 17.297,
578
+ "eval_steps_per_second": 4.326,
579
+ "step": 760
580
+ },
581
+ {
582
+ "epoch": 0.48,
583
+ "grad_norm": 0.34816452860832214,
584
+ "learning_rate": 0.00012369701983641388,
585
+ "loss": 0.3922,
586
+ "step": 780
587
+ },
588
+ {
589
+ "epoch": 0.48,
590
+ "eval_loss": 0.3943038880825043,
591
+ "eval_runtime": 165.7169,
592
+ "eval_samples_per_second": 17.301,
593
+ "eval_steps_per_second": 4.327,
594
+ "step": 780
595
+ },
596
+ {
597
+ "epoch": 0.5,
598
+ "grad_norm": 0.4880768060684204,
599
+ "learning_rate": 0.0001194692145910969,
600
+ "loss": 0.403,
601
+ "step": 800
602
+ },
603
+ {
604
+ "epoch": 0.5,
605
+ "eval_loss": 0.3928041160106659,
606
+ "eval_runtime": 165.6712,
607
+ "eval_samples_per_second": 17.305,
608
+ "eval_steps_per_second": 4.328,
609
+ "step": 800
610
+ },
611
+ {
612
+ "epoch": 0.51,
613
+ "grad_norm": 0.3198924660682678,
614
+ "learning_rate": 0.00011520490828545361,
615
+ "loss": 0.3913,
616
+ "step": 820
617
+ },
618
+ {
619
+ "epoch": 0.51,
620
+ "eval_loss": 0.3922466039657593,
621
+ "eval_runtime": 165.7505,
622
+ "eval_samples_per_second": 17.297,
623
+ "eval_steps_per_second": 4.326,
624
+ "step": 820
625
+ },
626
+ {
627
+ "epoch": 0.52,
628
+ "grad_norm": 0.33648762106895447,
629
+ "learning_rate": 0.00011091209567967229,
630
+ "loss": 0.3836,
631
+ "step": 840
632
+ },
633
+ {
634
+ "epoch": 0.52,
635
+ "eval_loss": 0.39126288890838623,
636
+ "eval_runtime": 165.7535,
637
+ "eval_samples_per_second": 17.297,
638
+ "eval_steps_per_second": 4.326,
639
+ "step": 840
640
+ },
641
+ {
642
+ "epoch": 0.53,
643
+ "grad_norm": 0.2874184846878052,
644
+ "learning_rate": 0.00010659882497781187,
645
+ "loss": 0.3736,
646
+ "step": 860
647
+ },
648
+ {
649
+ "epoch": 0.53,
650
+ "eval_loss": 0.3903014063835144,
651
+ "eval_runtime": 165.7718,
652
+ "eval_samples_per_second": 17.295,
653
+ "eval_steps_per_second": 4.325,
654
+ "step": 860
655
+ },
656
+ {
657
+ "epoch": 0.55,
658
+ "grad_norm": 0.37889736890792847,
659
+ "learning_rate": 0.00010227318273895532,
660
+ "loss": 0.3773,
661
+ "step": 880
662
+ },
663
+ {
664
+ "epoch": 0.55,
665
+ "eval_loss": 0.38970035314559937,
666
+ "eval_runtime": 165.5823,
667
+ "eval_samples_per_second": 17.315,
668
+ "eval_steps_per_second": 4.33,
669
+ "step": 880
670
+ },
671
+ {
672
+ "epoch": 0.56,
673
+ "grad_norm": 0.32016921043395996,
674
+ "learning_rate": 9.794327871645574e-05,
675
+ "loss": 0.3883,
676
+ "step": 900
677
+ },
678
+ {
679
+ "epoch": 0.56,
680
+ "eval_loss": 0.38903746008872986,
681
+ "eval_runtime": 165.5591,
682
+ "eval_samples_per_second": 17.317,
683
+ "eval_steps_per_second": 4.331,
684
+ "step": 900
685
+ },
686
+ {
687
+ "epoch": 0.57,
688
+ "grad_norm": 0.34894031286239624,
689
+ "learning_rate": 9.361723065369682e-05,
690
+ "loss": 0.3751,
691
+ "step": 920
692
+ },
693
+ {
694
+ "epoch": 0.57,
695
+ "eval_loss": 0.3883580267429352,
696
+ "eval_runtime": 165.5778,
697
+ "eval_samples_per_second": 17.315,
698
+ "eval_steps_per_second": 4.33,
699
+ "step": 920
700
+ },
701
+ {
702
+ "epoch": 0.58,
703
+ "grad_norm": 0.35315269231796265,
704
+ "learning_rate": 8.930314906487384e-05,
705
+ "loss": 0.3832,
706
+ "step": 940
707
+ },
708
+ {
709
+ "epoch": 0.58,
710
+ "eval_loss": 0.3874415457248688,
711
+ "eval_runtime": 165.6643,
712
+ "eval_samples_per_second": 17.306,
713
+ "eval_steps_per_second": 4.328,
714
+ "step": 940
715
+ },
716
+ {
717
+ "epoch": 0.6,
718
+ "grad_norm": 0.4276430904865265,
719
+ "learning_rate": 8.500912202932824e-05,
720
+ "loss": 0.3726,
721
+ "step": 960
722
+ },
723
+ {
724
+ "epoch": 0.6,
725
+ "eval_loss": 0.3868561387062073,
726
+ "eval_runtime": 165.6292,
727
+ "eval_samples_per_second": 17.31,
728
+ "eval_steps_per_second": 4.329,
729
+ "step": 960
730
+ },
731
+ {
732
+ "epoch": 0.61,
733
+ "grad_norm": 0.32810911536216736,
734
+ "learning_rate": 8.07432000279427e-05,
735
+ "loss": 0.3738,
736
+ "step": 980
737
+ },
738
+ {
739
+ "epoch": 0.61,
740
+ "eval_loss": 0.38609209656715393,
741
+ "eval_runtime": 165.6113,
742
+ "eval_samples_per_second": 17.312,
743
+ "eval_steps_per_second": 4.329,
744
+ "step": 980
745
+ },
746
+ {
747
+ "epoch": 0.62,
748
+ "grad_norm": 0.3598964214324951,
749
+ "learning_rate": 7.651338085002669e-05,
750
+ "loss": 0.3809,
751
+ "step": 1000
752
+ },
753
+ {
754
+ "epoch": 0.62,
755
+ "eval_loss": 0.3854670822620392,
756
+ "eval_runtime": 165.6347,
757
+ "eval_samples_per_second": 17.309,
758
+ "eval_steps_per_second": 4.329,
759
+ "step": 1000
760
+ },
761
+ {
762
+ "epoch": 0.63,
763
+ "grad_norm": 0.32283729314804077,
764
+ "learning_rate": 7.232759459898832e-05,
765
+ "loss": 0.3871,
766
+ "step": 1020
767
+ },
768
+ {
769
+ "epoch": 0.63,
770
+ "eval_loss": 0.38449159264564514,
771
+ "eval_runtime": 165.5636,
772
+ "eval_samples_per_second": 17.317,
773
+ "eval_steps_per_second": 4.331,
774
+ "step": 1020
775
+ },
776
+ {
777
+ "epoch": 0.64,
778
+ "grad_norm": 0.3151933252811432,
779
+ "learning_rate": 6.819368882490458e-05,
780
+ "loss": 0.3799,
781
+ "step": 1040
782
+ },
783
+ {
784
+ "epoch": 0.64,
785
+ "eval_loss": 0.3837529420852661,
786
+ "eval_runtime": 165.6596,
787
+ "eval_samples_per_second": 17.307,
788
+ "eval_steps_per_second": 4.328,
789
+ "step": 1040
790
+ },
791
+ {
792
+ "epoch": 0.66,
793
+ "grad_norm": 0.37252795696258545,
794
+ "learning_rate": 6.411941381186302e-05,
795
+ "loss": 0.3882,
796
+ "step": 1060
797
+ },
798
+ {
799
+ "epoch": 0.66,
800
+ "eval_loss": 0.38311225175857544,
801
+ "eval_runtime": 165.5928,
802
+ "eval_samples_per_second": 17.314,
803
+ "eval_steps_per_second": 4.33,
804
+ "step": 1060
805
+ },
806
+ {
807
+ "epoch": 0.67,
808
+ "grad_norm": 0.33380192518234253,
809
+ "learning_rate": 6.01124080476589e-05,
810
+ "loss": 0.3846,
811
+ "step": 1080
812
+ },
813
+ {
814
+ "epoch": 0.67,
815
+ "eval_loss": 0.3823437988758087,
816
+ "eval_runtime": 165.6364,
817
+ "eval_samples_per_second": 17.309,
818
+ "eval_steps_per_second": 4.329,
819
+ "step": 1080
820
+ },
821
+ {
822
+ "epoch": 0.68,
823
+ "grad_norm": 0.3543049991130829,
824
+ "learning_rate": 5.6180183903088844e-05,
825
+ "loss": 0.3696,
826
+ "step": 1100
827
+ },
828
+ {
829
+ "epoch": 0.68,
830
+ "eval_loss": 0.3821370601654053,
831
+ "eval_runtime": 165.5383,
832
+ "eval_samples_per_second": 17.319,
833
+ "eval_steps_per_second": 4.331,
834
+ "step": 1100
835
+ },
836
+ {
837
+ "epoch": 0.69,
838
+ "grad_norm": 0.374683141708374,
839
+ "learning_rate": 5.233011354768991e-05,
840
+ "loss": 0.3791,
841
+ "step": 1120
842
+ },
843
+ {
844
+ "epoch": 0.69,
845
+ "eval_loss": 0.38156434893608093,
846
+ "eval_runtime": 165.6726,
847
+ "eval_samples_per_second": 17.305,
848
+ "eval_steps_per_second": 4.328,
849
+ "step": 1120
850
+ },
851
+ {
852
+ "epoch": 0.71,
853
+ "grad_norm": 0.3851562738418579,
854
+ "learning_rate": 4.8569415128328945e-05,
855
+ "loss": 0.3726,
856
+ "step": 1140
857
+ },
858
+ {
859
+ "epoch": 0.71,
860
+ "eval_loss": 0.38082343339920044,
861
+ "eval_runtime": 165.6253,
862
+ "eval_samples_per_second": 17.31,
863
+ "eval_steps_per_second": 4.329,
864
+ "step": 1140
865
+ },
866
+ {
867
+ "epoch": 0.72,
868
+ "grad_norm": 0.422851026058197,
869
+ "learning_rate": 4.490513923655564e-05,
870
+ "loss": 0.3698,
871
+ "step": 1160
872
+ },
873
+ {
874
+ "epoch": 0.72,
875
+ "eval_loss": 0.38037535548210144,
876
+ "eval_runtime": 165.6523,
877
+ "eval_samples_per_second": 17.307,
878
+ "eval_steps_per_second": 4.328,
879
+ "step": 1160
880
+ },
881
+ {
882
+ "epoch": 0.73,
883
+ "grad_norm": 0.3657631278038025,
884
+ "learning_rate": 4.134415569008935e-05,
885
+ "loss": 0.3777,
886
+ "step": 1180
887
+ },
888
+ {
889
+ "epoch": 0.73,
890
+ "eval_loss": 0.3799656629562378,
891
+ "eval_runtime": 165.615,
892
+ "eval_samples_per_second": 17.311,
893
+ "eval_steps_per_second": 4.329,
894
+ "step": 1180
895
+ },
896
+ {
897
+ "epoch": 0.74,
898
+ "grad_norm": 0.34044766426086426,
899
+ "learning_rate": 3.789314065322218e-05,
900
+ "loss": 0.3637,
901
+ "step": 1200
902
+ },
903
+ {
904
+ "epoch": 0.74,
905
+ "eval_loss": 0.3793714940547943,
906
+ "eval_runtime": 165.687,
907
+ "eval_samples_per_second": 17.304,
908
+ "eval_steps_per_second": 4.327,
909
+ "step": 1200
910
+ },
911
+ {
912
+ "epoch": 0.76,
913
+ "grad_norm": 0.327467679977417,
914
+ "learning_rate": 3.455856412028593e-05,
915
+ "loss": 0.3653,
916
+ "step": 1220
917
+ },
918
+ {
919
+ "epoch": 0.76,
920
+ "eval_loss": 0.3786996603012085,
921
+ "eval_runtime": 165.7148,
922
+ "eval_samples_per_second": 17.301,
923
+ "eval_steps_per_second": 4.327,
924
+ "step": 1220
925
+ },
926
+ {
927
+ "epoch": 0.77,
928
+ "grad_norm": 0.3492739796638489,
929
+ "learning_rate": 3.1346677785647704e-05,
930
+ "loss": 0.382,
931
+ "step": 1240
932
+ },
933
+ {
934
+ "epoch": 0.77,
935
+ "eval_loss": 0.3782605230808258,
936
+ "eval_runtime": 165.621,
937
+ "eval_samples_per_second": 17.311,
938
+ "eval_steps_per_second": 4.329,
939
+ "step": 1240
940
+ },
941
+ {
942
+ "epoch": 0.78,
943
+ "grad_norm": 0.3024798333644867,
944
+ "learning_rate": 2.826350332297667e-05,
945
+ "loss": 0.3587,
946
+ "step": 1260
947
+ },
948
+ {
949
+ "epoch": 0.78,
950
+ "eval_loss": 0.37805166840553284,
951
+ "eval_runtime": 165.623,
952
+ "eval_samples_per_second": 17.31,
953
+ "eval_steps_per_second": 4.329,
954
+ "step": 1260
955
+ },
956
+ {
957
+ "epoch": 0.79,
958
+ "grad_norm": 0.3727082312107086,
959
+ "learning_rate": 2.531482109575547e-05,
960
+ "loss": 0.3729,
961
+ "step": 1280
962
+ },
963
+ {
964
+ "epoch": 0.79,
965
+ "eval_loss": 0.3775557577610016,
966
+ "eval_runtime": 165.6074,
967
+ "eval_samples_per_second": 17.312,
968
+ "eval_steps_per_second": 4.33,
969
+ "step": 1280
970
+ },
971
+ {
972
+ "epoch": 0.81,
973
+ "grad_norm": 0.41581809520721436,
974
+ "learning_rate": 2.250615932020238e-05,
975
+ "loss": 0.3731,
976
+ "step": 1300
977
+ },
978
+ {
979
+ "epoch": 0.81,
980
+ "eval_loss": 0.37723448872566223,
981
+ "eval_runtime": 165.6275,
982
+ "eval_samples_per_second": 17.31,
983
+ "eval_steps_per_second": 4.329,
984
+ "step": 1300
985
+ },
986
+ {
987
+ "epoch": 0.82,
988
+ "grad_norm": 0.44623810052871704,
989
+ "learning_rate": 1.9842783700921196e-05,
990
+ "loss": 0.3757,
991
+ "step": 1320
992
+ },
993
+ {
994
+ "epoch": 0.82,
995
+ "eval_loss": 0.3769790530204773,
996
+ "eval_runtime": 165.5377,
997
+ "eval_samples_per_second": 17.319,
998
+ "eval_steps_per_second": 4.331,
999
+ "step": 1320
1000
+ },
1001
+ {
1002
+ "epoch": 0.83,
1003
+ "grad_norm": 0.365567147731781,
1004
+ "learning_rate": 1.732968755871063e-05,
1005
+ "loss": 0.3733,
1006
+ "step": 1340
1007
+ },
1008
+ {
1009
+ "epoch": 0.83,
1010
+ "eval_loss": 0.3767223656177521,
1011
+ "eval_runtime": 165.6665,
1012
+ "eval_samples_per_second": 17.306,
1013
+ "eval_steps_per_second": 4.328,
1014
+ "step": 1340
1015
+ },
1016
+ {
1017
+ "epoch": 0.84,
1018
+ "grad_norm": 0.4132380187511444,
1019
+ "learning_rate": 1.4971582469040957e-05,
1020
+ "loss": 0.3792,
1021
+ "step": 1360
1022
+ },
1023
+ {
1024
+ "epoch": 0.84,
1025
+ "eval_loss": 0.3763655126094818,
1026
+ "eval_runtime": 165.6456,
1027
+ "eval_samples_per_second": 17.308,
1028
+ "eval_steps_per_second": 4.329,
1029
+ "step": 1360
1030
+ },
1031
+ {
1032
+ "epoch": 0.86,
1033
+ "grad_norm": 0.33449500799179077,
1034
+ "learning_rate": 1.2772889428749524e-05,
1035
+ "loss": 0.3678,
1036
+ "step": 1380
1037
+ },
1038
+ {
1039
+ "epoch": 0.86,
1040
+ "eval_loss": 0.3761462867259979,
1041
+ "eval_runtime": 165.6026,
1042
+ "eval_samples_per_second": 17.313,
1043
+ "eval_steps_per_second": 4.33,
1044
+ "step": 1380
1045
+ },
1046
+ {
1047
+ "epoch": 0.87,
1048
+ "grad_norm": 0.28829070925712585,
1049
+ "learning_rate": 1.0737730567515847e-05,
1050
+ "loss": 0.3604,
1051
+ "step": 1400
1052
+ },
1053
+ {
1054
+ "epoch": 0.87,
1055
+ "eval_loss": 0.3759004473686218,
1056
+ "eval_runtime": 165.5228,
1057
+ "eval_samples_per_second": 17.321,
1058
+ "eval_steps_per_second": 4.332,
1059
+ "step": 1400
1060
+ },
1061
+ {
1062
+ "epoch": 0.88,
1063
+ "grad_norm": 0.4277011454105377,
1064
+ "learning_rate": 8.869921419655457e-06,
1065
+ "loss": 0.3496,
1066
+ "step": 1420
1067
+ },
1068
+ {
1069
+ "epoch": 0.88,
1070
+ "eval_loss": 0.3757947087287903,
1071
+ "eval_runtime": 165.6492,
1072
+ "eval_samples_per_second": 17.308,
1073
+ "eval_steps_per_second": 4.328,
1074
+ "step": 1420
1075
+ },
1076
+ {
1077
+ "epoch": 0.89,
1078
+ "grad_norm": 0.40312379598617554,
1079
+ "learning_rate": 7.172963770721341e-06,
1080
+ "loss": 0.3676,
1081
+ "step": 1440
1082
+ },
1083
+ {
1084
+ "epoch": 0.89,
1085
+ "eval_loss": 0.3757094442844391,
1086
+ "eval_runtime": 165.5144,
1087
+ "eval_samples_per_second": 17.322,
1088
+ "eval_steps_per_second": 4.332,
1089
+ "step": 1440
1090
+ },
1091
+ {
1092
+ "epoch": 0.91,
1093
+ "grad_norm": 0.45307889580726624,
1094
+ "learning_rate": 5.650039092324766e-06,
1095
+ "loss": 0.3678,
1096
+ "step": 1460
1097
+ },
1098
+ {
1099
+ "epoch": 0.91,
1100
+ "eval_loss": 0.37566059827804565,
1101
+ "eval_runtime": 165.5183,
1102
+ "eval_samples_per_second": 17.321,
1103
+ "eval_steps_per_second": 4.332,
1104
+ "step": 1460
1105
+ },
1106
+ {
1107
+ "epoch": 0.92,
1108
+ "grad_norm": 0.31653299927711487,
1109
+ "learning_rate": 4.304002577483357e-06,
1110
+ "loss": 0.3646,
1111
+ "step": 1480
1112
+ },
1113
+ {
1114
+ "epoch": 0.92,
1115
+ "eval_loss": 0.3755495548248291,
1116
+ "eval_runtime": 165.5577,
1117
+ "eval_samples_per_second": 17.317,
1118
+ "eval_steps_per_second": 4.331,
1119
+ "step": 1480
1120
+ },
1121
+ {
1122
+ "epoch": 0.93,
1123
+ "grad_norm": 0.3237595856189728,
1124
+ "learning_rate": 3.13737778767923e-06,
1125
+ "loss": 0.3621,
1126
+ "step": 1500
1127
+ },
1128
+ {
1129
+ "epoch": 0.93,
1130
+ "eval_loss": 0.3754778504371643,
1131
+ "eval_runtime": 165.5564,
1132
+ "eval_samples_per_second": 17.317,
1133
+ "eval_steps_per_second": 4.331,
1134
+ "step": 1500
1135
+ },
1136
+ {
1137
+ "epoch": 0.94,
1138
+ "grad_norm": 0.41257408261299133,
1139
+ "learning_rate": 2.1523519216631094e-06,
1140
+ "loss": 0.3825,
1141
+ "step": 1520
1142
+ },
1143
+ {
1144
+ "epoch": 0.94,
1145
+ "eval_loss": 0.3754417300224304,
1146
+ "eval_runtime": 165.5324,
1147
+ "eval_samples_per_second": 17.32,
1148
+ "eval_steps_per_second": 4.331,
1149
+ "step": 1520
1150
+ }
1151
+ ],
1152
+ "logging_steps": 20,
1153
+ "max_steps": 1612,
1154
+ "num_input_tokens_seen": 0,
1155
+ "num_train_epochs": 1,
1156
+ "save_steps": 20,
1157
+ "total_flos": 7.848376780749537e+17,
1158
+ "train_batch_size": 8,
1159
+ "trial_name": null,
1160
+ "trial_params": null
1161
+ }
checkpoint-1520/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19cdc3b0645f297de07155d8f0cee10c20d51defbbc762523c42ad678ebd6dbd
3
+ size 5176
checkpoint-1540/README.md ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+
201
+
202
+ ### Framework versions
203
+
204
+ - PEFT 0.8.2
checkpoint-1540/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.1,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 8,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "v_proj",
23
+ "k_proj",
24
+ "q_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": "CAUSAL_LM",
28
+ "use_rslora": false
29
+ }
checkpoint-1540/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17a72d10f9fa75c9578ea582ce17e7b1f1df0409d6c636c2b3e71f0345d3cc90
3
+ size 27297032
checkpoint-1540/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:370de6bca2f9c5dbbe2342eb1078ff874afad96c19cd542fd613e6e379e7f35c
3
+ size 54678266
checkpoint-1540/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:959e54607cfbf00c10e9e0355fbcf0841eaa0ae0240205cdb890d7bf760633ba
3
+ size 14512
checkpoint-1540/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:090413aa08658517e33fd7bd136f03414977178a69f37ea012a04f5d1c8dbe35
3
+ size 14512
checkpoint-1540/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1681d4c860a3c5ce4bd87ba235bfe9fc1520c429d2434187ca819d9f22cc82cc
3
+ size 1000
checkpoint-1540/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-1540/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1540/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-1540/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [],
31
+ "bos_token": "<s>",
32
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
checkpoint-1540/trainer_state.json ADDED
@@ -0,0 +1,1176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.37540262937545776,
3
+ "best_model_checkpoint": "./mistral/22-02-24-Weni-ZeroShot-3.3.3-Mistral-7b-Multilanguage-3.2.0_Zeroshot-2_max_steps-1612_batch_16_2024-02-22_ppid_1326/checkpoint-1540",
4
+ "epoch": 0.9547427154370738,
5
+ "eval_steps": 20,
6
+ "global_step": 1540,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "grad_norm": 1.6690024137496948,
14
+ "learning_rate": 2.3602484472049692e-05,
15
+ "loss": 1.8728,
16
+ "step": 20
17
+ },
18
+ {
19
+ "epoch": 0.01,
20
+ "eval_loss": 1.7905555963516235,
21
+ "eval_runtime": 165.4628,
22
+ "eval_samples_per_second": 17.327,
23
+ "eval_steps_per_second": 4.333,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.02,
28
+ "grad_norm": 1.7746976613998413,
29
+ "learning_rate": 4.8447204968944106e-05,
30
+ "loss": 1.4796,
31
+ "step": 40
32
+ },
33
+ {
34
+ "epoch": 0.02,
35
+ "eval_loss": 1.1875672340393066,
36
+ "eval_runtime": 165.7743,
37
+ "eval_samples_per_second": 17.295,
38
+ "eval_steps_per_second": 4.325,
39
+ "step": 40
40
+ },
41
+ {
42
+ "epoch": 0.04,
43
+ "grad_norm": 1.1963611841201782,
44
+ "learning_rate": 7.329192546583851e-05,
45
+ "loss": 0.8318,
46
+ "step": 60
47
+ },
48
+ {
49
+ "epoch": 0.04,
50
+ "eval_loss": 0.6325646638870239,
51
+ "eval_runtime": 165.8606,
52
+ "eval_samples_per_second": 17.286,
53
+ "eval_steps_per_second": 4.323,
54
+ "step": 60
55
+ },
56
+ {
57
+ "epoch": 0.05,
58
+ "grad_norm": 0.6274264454841614,
59
+ "learning_rate": 9.813664596273293e-05,
60
+ "loss": 0.5478,
61
+ "step": 80
62
+ },
63
+ {
64
+ "epoch": 0.05,
65
+ "eval_loss": 0.541927695274353,
66
+ "eval_runtime": 165.8755,
67
+ "eval_samples_per_second": 17.284,
68
+ "eval_steps_per_second": 4.323,
69
+ "step": 80
70
+ },
71
+ {
72
+ "epoch": 0.06,
73
+ "grad_norm": 0.7583674788475037,
74
+ "learning_rate": 0.00012298136645962735,
75
+ "loss": 0.517,
76
+ "step": 100
77
+ },
78
+ {
79
+ "epoch": 0.06,
80
+ "eval_loss": 0.5157255530357361,
81
+ "eval_runtime": 165.8227,
82
+ "eval_samples_per_second": 17.29,
83
+ "eval_steps_per_second": 4.324,
84
+ "step": 100
85
+ },
86
+ {
87
+ "epoch": 0.07,
88
+ "grad_norm": 0.496155321598053,
89
+ "learning_rate": 0.00014782608695652173,
90
+ "loss": 0.5109,
91
+ "step": 120
92
+ },
93
+ {
94
+ "epoch": 0.07,
95
+ "eval_loss": 0.49060019850730896,
96
+ "eval_runtime": 165.8171,
97
+ "eval_samples_per_second": 17.29,
98
+ "eval_steps_per_second": 4.324,
99
+ "step": 120
100
+ },
101
+ {
102
+ "epoch": 0.09,
103
+ "grad_norm": 0.3945171535015106,
104
+ "learning_rate": 0.00017142857142857143,
105
+ "loss": 0.4656,
106
+ "step": 140
107
+ },
108
+ {
109
+ "epoch": 0.09,
110
+ "eval_loss": 0.4657692313194275,
111
+ "eval_runtime": 165.7753,
112
+ "eval_samples_per_second": 17.294,
113
+ "eval_steps_per_second": 4.325,
114
+ "step": 140
115
+ },
116
+ {
117
+ "epoch": 0.1,
118
+ "grad_norm": 0.3318285346031189,
119
+ "learning_rate": 0.00019627329192546585,
120
+ "loss": 0.4409,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.1,
125
+ "eval_loss": 0.45186159014701843,
126
+ "eval_runtime": 165.7746,
127
+ "eval_samples_per_second": 17.295,
128
+ "eval_steps_per_second": 4.325,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 0.11,
133
+ "grad_norm": 0.4603807330131531,
134
+ "learning_rate": 0.00019993226958500473,
135
+ "loss": 0.4316,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.11,
140
+ "eval_loss": 0.4474850594997406,
141
+ "eval_runtime": 165.6607,
142
+ "eval_samples_per_second": 17.306,
143
+ "eval_steps_per_second": 4.328,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 0.12,
148
+ "grad_norm": 0.4568885862827301,
149
+ "learning_rate": 0.00019967929472585524,
150
+ "loss": 0.4297,
151
+ "step": 200
152
+ },
153
+ {
154
+ "epoch": 0.12,
155
+ "eval_loss": 0.4427547752857208,
156
+ "eval_runtime": 165.7208,
157
+ "eval_samples_per_second": 17.3,
158
+ "eval_steps_per_second": 4.327,
159
+ "step": 200
160
+ },
161
+ {
162
+ "epoch": 0.14,
163
+ "grad_norm": 0.384003221988678,
164
+ "learning_rate": 0.00019923944021970962,
165
+ "loss": 0.4226,
166
+ "step": 220
167
+ },
168
+ {
169
+ "epoch": 0.14,
170
+ "eval_loss": 0.4389376938343048,
171
+ "eval_runtime": 165.7023,
172
+ "eval_samples_per_second": 17.302,
173
+ "eval_steps_per_second": 4.327,
174
+ "step": 220
175
+ },
176
+ {
177
+ "epoch": 0.15,
178
+ "grad_norm": 0.30612272024154663,
179
+ "learning_rate": 0.00019861353070979048,
180
+ "loss": 0.4321,
181
+ "step": 240
182
+ },
183
+ {
184
+ "epoch": 0.15,
185
+ "eval_loss": 0.4359733462333679,
186
+ "eval_runtime": 165.7286,
187
+ "eval_samples_per_second": 17.299,
188
+ "eval_steps_per_second": 4.326,
189
+ "step": 240
190
+ },
191
+ {
192
+ "epoch": 0.16,
193
+ "grad_norm": 0.4149855971336365,
194
+ "learning_rate": 0.0001978027396569313,
195
+ "loss": 0.4261,
196
+ "step": 260
197
+ },
198
+ {
199
+ "epoch": 0.16,
200
+ "eval_loss": 0.4336954355239868,
201
+ "eval_runtime": 165.7171,
202
+ "eval_samples_per_second": 17.301,
203
+ "eval_steps_per_second": 4.327,
204
+ "step": 260
205
+ },
206
+ {
207
+ "epoch": 0.17,
208
+ "grad_norm": 0.28470170497894287,
209
+ "learning_rate": 0.00019680858713956126,
210
+ "loss": 0.4235,
211
+ "step": 280
212
+ },
213
+ {
214
+ "epoch": 0.17,
215
+ "eval_loss": 0.4306911528110504,
216
+ "eval_runtime": 165.7298,
217
+ "eval_samples_per_second": 17.299,
218
+ "eval_steps_per_second": 4.326,
219
+ "step": 280
220
+ },
221
+ {
222
+ "epoch": 0.19,
223
+ "grad_norm": 0.3317676782608032,
224
+ "learning_rate": 0.00019563293700384832,
225
+ "loss": 0.4279,
226
+ "step": 300
227
+ },
228
+ {
229
+ "epoch": 0.19,
230
+ "eval_loss": 0.4280063509941101,
231
+ "eval_runtime": 165.7359,
232
+ "eval_samples_per_second": 17.299,
233
+ "eval_steps_per_second": 4.326,
234
+ "step": 300
235
+ },
236
+ {
237
+ "epoch": 0.2,
238
+ "grad_norm": 0.3677004277706146,
239
+ "learning_rate": 0.0001942779933693437,
240
+ "loss": 0.419,
241
+ "step": 320
242
+ },
243
+ {
244
+ "epoch": 0.2,
245
+ "eval_loss": 0.425252765417099,
246
+ "eval_runtime": 165.7143,
247
+ "eval_samples_per_second": 17.301,
248
+ "eval_steps_per_second": 4.327,
249
+ "step": 320
250
+ },
251
+ {
252
+ "epoch": 0.21,
253
+ "grad_norm": 0.34667083621025085,
254
+ "learning_rate": 0.00019274629649667838,
255
+ "loss": 0.4129,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 0.21,
260
+ "eval_loss": 0.423022985458374,
261
+ "eval_runtime": 165.6904,
262
+ "eval_samples_per_second": 17.303,
263
+ "eval_steps_per_second": 4.327,
264
+ "step": 340
265
+ },
266
+ {
267
+ "epoch": 0.22,
268
+ "grad_norm": 0.37289124727249146,
269
+ "learning_rate": 0.00019104071802505943,
270
+ "loss": 0.4097,
271
+ "step": 360
272
+ },
273
+ {
274
+ "epoch": 0.22,
275
+ "eval_loss": 0.4223038852214813,
276
+ "eval_runtime": 165.7184,
277
+ "eval_samples_per_second": 17.3,
278
+ "eval_steps_per_second": 4.327,
279
+ "step": 360
280
+ },
281
+ {
282
+ "epoch": 0.24,
283
+ "grad_norm": 0.9662333130836487,
284
+ "learning_rate": 0.00018926226943248415,
285
+ "loss": 0.4204,
286
+ "step": 380
287
+ },
288
+ {
289
+ "epoch": 0.24,
290
+ "eval_loss": 0.41996675729751587,
291
+ "eval_runtime": 165.7378,
292
+ "eval_samples_per_second": 17.298,
293
+ "eval_steps_per_second": 4.326,
294
+ "step": 380
295
+ },
296
+ {
297
+ "epoch": 0.25,
298
+ "grad_norm": 0.43084949254989624,
299
+ "learning_rate": 0.00018722711057125052,
300
+ "loss": 0.4042,
301
+ "step": 400
302
+ },
303
+ {
304
+ "epoch": 0.25,
305
+ "eval_loss": 0.4190637767314911,
306
+ "eval_runtime": 165.6378,
307
+ "eval_samples_per_second": 17.309,
308
+ "eval_steps_per_second": 4.329,
309
+ "step": 400
310
+ },
311
+ {
312
+ "epoch": 0.26,
313
+ "grad_norm": 0.3700352609157562,
314
+ "learning_rate": 0.00018502841753095908,
315
+ "loss": 0.4134,
316
+ "step": 420
317
+ },
318
+ {
319
+ "epoch": 0.26,
320
+ "eval_loss": 0.4176079034805298,
321
+ "eval_runtime": 165.7021,
322
+ "eval_samples_per_second": 17.302,
323
+ "eval_steps_per_second": 4.327,
324
+ "step": 420
325
+ },
326
+ {
327
+ "epoch": 0.27,
328
+ "grad_norm": 0.2829599976539612,
329
+ "learning_rate": 0.00018267031244128938,
330
+ "loss": 0.4006,
331
+ "step": 440
332
+ },
333
+ {
334
+ "epoch": 0.27,
335
+ "eval_loss": 0.4157721698284149,
336
+ "eval_runtime": 165.7174,
337
+ "eval_samples_per_second": 17.301,
338
+ "eval_steps_per_second": 4.327,
339
+ "step": 440
340
+ },
341
+ {
342
+ "epoch": 0.29,
343
+ "grad_norm": 0.2943759560585022,
344
+ "learning_rate": 0.00018015721629907882,
345
+ "loss": 0.4004,
346
+ "step": 460
347
+ },
348
+ {
349
+ "epoch": 0.29,
350
+ "eval_loss": 0.4140998125076294,
351
+ "eval_runtime": 165.7622,
352
+ "eval_samples_per_second": 17.296,
353
+ "eval_steps_per_second": 4.325,
354
+ "step": 460
355
+ },
356
+ {
357
+ "epoch": 0.3,
358
+ "grad_norm": 0.36071881651878357,
359
+ "learning_rate": 0.00017749384067979764,
360
+ "loss": 0.3967,
361
+ "step": 480
362
+ },
363
+ {
364
+ "epoch": 0.3,
365
+ "eval_loss": 0.4122526943683624,
366
+ "eval_runtime": 165.6609,
367
+ "eval_samples_per_second": 17.306,
368
+ "eval_steps_per_second": 4.328,
369
+ "step": 480
370
+ },
371
+ {
372
+ "epoch": 0.31,
373
+ "grad_norm": 0.3050592243671417,
374
+ "learning_rate": 0.00017468517890424455,
375
+ "loss": 0.4089,
376
+ "step": 500
377
+ },
378
+ {
379
+ "epoch": 0.31,
380
+ "eval_loss": 0.4099767506122589,
381
+ "eval_runtime": 165.7178,
382
+ "eval_samples_per_second": 17.3,
383
+ "eval_steps_per_second": 4.327,
384
+ "step": 500
385
+ },
386
+ {
387
+ "epoch": 0.32,
388
+ "grad_norm": 0.3138140141963959,
389
+ "learning_rate": 0.00017173649667702337,
390
+ "loss": 0.3924,
391
+ "step": 520
392
+ },
393
+ {
394
+ "epoch": 0.32,
395
+ "eval_loss": 0.40873026847839355,
396
+ "eval_runtime": 165.7231,
397
+ "eval_samples_per_second": 17.3,
398
+ "eval_steps_per_second": 4.326,
399
+ "step": 520
400
+ },
401
+ {
402
+ "epoch": 0.33,
403
+ "grad_norm": 0.299787700176239,
404
+ "learning_rate": 0.0001686533222143523,
405
+ "loss": 0.4118,
406
+ "step": 540
407
+ },
408
+ {
409
+ "epoch": 0.33,
410
+ "eval_loss": 0.40794187784194946,
411
+ "eval_runtime": 165.723,
412
+ "eval_samples_per_second": 17.3,
413
+ "eval_steps_per_second": 4.326,
414
+ "step": 540
415
+ },
416
+ {
417
+ "epoch": 0.35,
418
+ "grad_norm": 0.37338224053382874,
419
+ "learning_rate": 0.0001654414358797141,
420
+ "loss": 0.4027,
421
+ "step": 560
422
+ },
423
+ {
424
+ "epoch": 0.35,
425
+ "eval_loss": 0.4068893790245056,
426
+ "eval_runtime": 165.5504,
427
+ "eval_samples_per_second": 17.318,
428
+ "eval_steps_per_second": 4.331,
429
+ "step": 560
430
+ },
431
+ {
432
+ "epoch": 0.36,
433
+ "grad_norm": 0.3397510349750519,
434
+ "learning_rate": 0.00016210685934677782,
435
+ "loss": 0.393,
436
+ "step": 580
437
+ },
438
+ {
439
+ "epoch": 0.36,
440
+ "eval_loss": 0.40551885962486267,
441
+ "eval_runtime": 165.5533,
442
+ "eval_samples_per_second": 17.318,
443
+ "eval_steps_per_second": 4.331,
444
+ "step": 580
445
+ },
446
+ {
447
+ "epoch": 0.37,
448
+ "grad_norm": 0.3803115487098694,
449
+ "learning_rate": 0.00015883106145163397,
450
+ "loss": 0.4103,
451
+ "step": 600
452
+ },
453
+ {
454
+ "epoch": 0.37,
455
+ "eval_loss": 0.40474218130111694,
456
+ "eval_runtime": 165.6206,
457
+ "eval_samples_per_second": 17.311,
458
+ "eval_steps_per_second": 4.329,
459
+ "step": 600
460
+ },
461
+ {
462
+ "epoch": 0.38,
463
+ "grad_norm": 0.3368137776851654,
464
+ "learning_rate": 0.00015527541943543543,
465
+ "loss": 0.3896,
466
+ "step": 620
467
+ },
468
+ {
469
+ "epoch": 0.38,
470
+ "eval_loss": 0.40325844287872314,
471
+ "eval_runtime": 165.6617,
472
+ "eval_samples_per_second": 17.306,
473
+ "eval_steps_per_second": 4.328,
474
+ "step": 620
475
+ },
476
+ {
477
+ "epoch": 0.4,
478
+ "grad_norm": 0.32399722933769226,
479
+ "learning_rate": 0.00015161614656089196,
480
+ "loss": 0.3912,
481
+ "step": 640
482
+ },
483
+ {
484
+ "epoch": 0.4,
485
+ "eval_loss": 0.4015989899635315,
486
+ "eval_runtime": 165.6304,
487
+ "eval_samples_per_second": 17.31,
488
+ "eval_steps_per_second": 4.329,
489
+ "step": 640
490
+ },
491
+ {
492
+ "epoch": 0.41,
493
+ "grad_norm": 0.42368754744529724,
494
+ "learning_rate": 0.0001478601032660207,
495
+ "loss": 0.3897,
496
+ "step": 660
497
+ },
498
+ {
499
+ "epoch": 0.41,
500
+ "eval_loss": 0.40123647451400757,
501
+ "eval_runtime": 165.6021,
502
+ "eval_samples_per_second": 17.313,
503
+ "eval_steps_per_second": 4.33,
504
+ "step": 660
505
+ },
506
+ {
507
+ "epoch": 0.42,
508
+ "grad_norm": 0.36450713872909546,
509
+ "learning_rate": 0.00014401433141490152,
510
+ "loss": 0.3963,
511
+ "step": 680
512
+ },
513
+ {
514
+ "epoch": 0.42,
515
+ "eval_loss": 0.39942467212677,
516
+ "eval_runtime": 165.5741,
517
+ "eval_samples_per_second": 17.316,
518
+ "eval_steps_per_second": 4.33,
519
+ "step": 680
520
+ },
521
+ {
522
+ "epoch": 0.43,
523
+ "grad_norm": 0.3598613739013672,
524
+ "learning_rate": 0.00014008604109552665,
525
+ "loss": 0.3914,
526
+ "step": 700
527
+ },
528
+ {
529
+ "epoch": 0.43,
530
+ "eval_loss": 0.39811620116233826,
531
+ "eval_runtime": 165.6088,
532
+ "eval_samples_per_second": 17.312,
533
+ "eval_steps_per_second": 4.329,
534
+ "step": 700
535
+ },
536
+ {
537
+ "epoch": 0.45,
538
+ "grad_norm": 0.3440189063549042,
539
+ "learning_rate": 0.00013608259710226186,
540
+ "loss": 0.3769,
541
+ "step": 720
542
+ },
543
+ {
544
+ "epoch": 0.45,
545
+ "eval_loss": 0.396987646818161,
546
+ "eval_runtime": 165.6485,
547
+ "eval_samples_per_second": 17.308,
548
+ "eval_steps_per_second": 4.328,
549
+ "step": 720
550
+ },
551
+ {
552
+ "epoch": 0.46,
553
+ "grad_norm": 0.3481440544128418,
554
+ "learning_rate": 0.0001320115051282632,
555
+ "loss": 0.3904,
556
+ "step": 740
557
+ },
558
+ {
559
+ "epoch": 0.46,
560
+ "eval_loss": 0.3970092833042145,
561
+ "eval_runtime": 165.6528,
562
+ "eval_samples_per_second": 17.307,
563
+ "eval_steps_per_second": 4.328,
564
+ "step": 740
565
+ },
566
+ {
567
+ "epoch": 0.47,
568
+ "grad_norm": 0.4135587215423584,
569
+ "learning_rate": 0.0001278803976937355,
570
+ "loss": 0.3831,
571
+ "step": 760
572
+ },
573
+ {
574
+ "epoch": 0.47,
575
+ "eval_loss": 0.39514589309692383,
576
+ "eval_runtime": 165.7492,
577
+ "eval_samples_per_second": 17.297,
578
+ "eval_steps_per_second": 4.326,
579
+ "step": 760
580
+ },
581
+ {
582
+ "epoch": 0.48,
583
+ "grad_norm": 0.34816452860832214,
584
+ "learning_rate": 0.00012369701983641388,
585
+ "loss": 0.3922,
586
+ "step": 780
587
+ },
588
+ {
589
+ "epoch": 0.48,
590
+ "eval_loss": 0.3943038880825043,
591
+ "eval_runtime": 165.7169,
592
+ "eval_samples_per_second": 17.301,
593
+ "eval_steps_per_second": 4.327,
594
+ "step": 780
595
+ },
596
+ {
597
+ "epoch": 0.5,
598
+ "grad_norm": 0.4880768060684204,
599
+ "learning_rate": 0.0001194692145910969,
600
+ "loss": 0.403,
601
+ "step": 800
602
+ },
603
+ {
604
+ "epoch": 0.5,
605
+ "eval_loss": 0.3928041160106659,
606
+ "eval_runtime": 165.6712,
607
+ "eval_samples_per_second": 17.305,
608
+ "eval_steps_per_second": 4.328,
609
+ "step": 800
610
+ },
611
+ {
612
+ "epoch": 0.51,
613
+ "grad_norm": 0.3198924660682678,
614
+ "learning_rate": 0.00011520490828545361,
615
+ "loss": 0.3913,
616
+ "step": 820
617
+ },
618
+ {
619
+ "epoch": 0.51,
620
+ "eval_loss": 0.3922466039657593,
621
+ "eval_runtime": 165.7505,
622
+ "eval_samples_per_second": 17.297,
623
+ "eval_steps_per_second": 4.326,
624
+ "step": 820
625
+ },
626
+ {
627
+ "epoch": 0.52,
628
+ "grad_norm": 0.33648762106895447,
629
+ "learning_rate": 0.00011091209567967229,
630
+ "loss": 0.3836,
631
+ "step": 840
632
+ },
633
+ {
634
+ "epoch": 0.52,
635
+ "eval_loss": 0.39126288890838623,
636
+ "eval_runtime": 165.7535,
637
+ "eval_samples_per_second": 17.297,
638
+ "eval_steps_per_second": 4.326,
639
+ "step": 840
640
+ },
641
+ {
642
+ "epoch": 0.53,
643
+ "grad_norm": 0.2874184846878052,
644
+ "learning_rate": 0.00010659882497781187,
645
+ "loss": 0.3736,
646
+ "step": 860
647
+ },
648
+ {
649
+ "epoch": 0.53,
650
+ "eval_loss": 0.3903014063835144,
651
+ "eval_runtime": 165.7718,
652
+ "eval_samples_per_second": 17.295,
653
+ "eval_steps_per_second": 4.325,
654
+ "step": 860
655
+ },
656
+ {
657
+ "epoch": 0.55,
658
+ "grad_norm": 0.37889736890792847,
659
+ "learning_rate": 0.00010227318273895532,
660
+ "loss": 0.3773,
661
+ "step": 880
662
+ },
663
+ {
664
+ "epoch": 0.55,
665
+ "eval_loss": 0.38970035314559937,
666
+ "eval_runtime": 165.5823,
667
+ "eval_samples_per_second": 17.315,
668
+ "eval_steps_per_second": 4.33,
669
+ "step": 880
670
+ },
671
+ {
672
+ "epoch": 0.56,
673
+ "grad_norm": 0.32016921043395996,
674
+ "learning_rate": 9.794327871645574e-05,
675
+ "loss": 0.3883,
676
+ "step": 900
677
+ },
678
+ {
679
+ "epoch": 0.56,
680
+ "eval_loss": 0.38903746008872986,
681
+ "eval_runtime": 165.5591,
682
+ "eval_samples_per_second": 17.317,
683
+ "eval_steps_per_second": 4.331,
684
+ "step": 900
685
+ },
686
+ {
687
+ "epoch": 0.57,
688
+ "grad_norm": 0.34894031286239624,
689
+ "learning_rate": 9.361723065369682e-05,
690
+ "loss": 0.3751,
691
+ "step": 920
692
+ },
693
+ {
694
+ "epoch": 0.57,
695
+ "eval_loss": 0.3883580267429352,
696
+ "eval_runtime": 165.5778,
697
+ "eval_samples_per_second": 17.315,
698
+ "eval_steps_per_second": 4.33,
699
+ "step": 920
700
+ },
701
+ {
702
+ "epoch": 0.58,
703
+ "grad_norm": 0.35315269231796265,
704
+ "learning_rate": 8.930314906487384e-05,
705
+ "loss": 0.3832,
706
+ "step": 940
707
+ },
708
+ {
709
+ "epoch": 0.58,
710
+ "eval_loss": 0.3874415457248688,
711
+ "eval_runtime": 165.6643,
712
+ "eval_samples_per_second": 17.306,
713
+ "eval_steps_per_second": 4.328,
714
+ "step": 940
715
+ },
716
+ {
717
+ "epoch": 0.6,
718
+ "grad_norm": 0.4276430904865265,
719
+ "learning_rate": 8.500912202932824e-05,
720
+ "loss": 0.3726,
721
+ "step": 960
722
+ },
723
+ {
724
+ "epoch": 0.6,
725
+ "eval_loss": 0.3868561387062073,
726
+ "eval_runtime": 165.6292,
727
+ "eval_samples_per_second": 17.31,
728
+ "eval_steps_per_second": 4.329,
729
+ "step": 960
730
+ },
731
+ {
732
+ "epoch": 0.61,
733
+ "grad_norm": 0.32810911536216736,
734
+ "learning_rate": 8.07432000279427e-05,
735
+ "loss": 0.3738,
736
+ "step": 980
737
+ },
738
+ {
739
+ "epoch": 0.61,
740
+ "eval_loss": 0.38609209656715393,
741
+ "eval_runtime": 165.6113,
742
+ "eval_samples_per_second": 17.312,
743
+ "eval_steps_per_second": 4.329,
744
+ "step": 980
745
+ },
746
+ {
747
+ "epoch": 0.62,
748
+ "grad_norm": 0.3598964214324951,
749
+ "learning_rate": 7.651338085002669e-05,
750
+ "loss": 0.3809,
751
+ "step": 1000
752
+ },
753
+ {
754
+ "epoch": 0.62,
755
+ "eval_loss": 0.3854670822620392,
756
+ "eval_runtime": 165.6347,
757
+ "eval_samples_per_second": 17.309,
758
+ "eval_steps_per_second": 4.329,
759
+ "step": 1000
760
+ },
761
+ {
762
+ "epoch": 0.63,
763
+ "grad_norm": 0.32283729314804077,
764
+ "learning_rate": 7.232759459898832e-05,
765
+ "loss": 0.3871,
766
+ "step": 1020
767
+ },
768
+ {
769
+ "epoch": 0.63,
770
+ "eval_loss": 0.38449159264564514,
771
+ "eval_runtime": 165.5636,
772
+ "eval_samples_per_second": 17.317,
773
+ "eval_steps_per_second": 4.331,
774
+ "step": 1020
775
+ },
776
+ {
777
+ "epoch": 0.64,
778
+ "grad_norm": 0.3151933252811432,
779
+ "learning_rate": 6.819368882490458e-05,
780
+ "loss": 0.3799,
781
+ "step": 1040
782
+ },
783
+ {
784
+ "epoch": 0.64,
785
+ "eval_loss": 0.3837529420852661,
786
+ "eval_runtime": 165.6596,
787
+ "eval_samples_per_second": 17.307,
788
+ "eval_steps_per_second": 4.328,
789
+ "step": 1040
790
+ },
791
+ {
792
+ "epoch": 0.66,
793
+ "grad_norm": 0.37252795696258545,
794
+ "learning_rate": 6.411941381186302e-05,
795
+ "loss": 0.3882,
796
+ "step": 1060
797
+ },
798
+ {
799
+ "epoch": 0.66,
800
+ "eval_loss": 0.38311225175857544,
801
+ "eval_runtime": 165.5928,
802
+ "eval_samples_per_second": 17.314,
803
+ "eval_steps_per_second": 4.33,
804
+ "step": 1060
805
+ },
806
+ {
807
+ "epoch": 0.67,
808
+ "grad_norm": 0.33380192518234253,
809
+ "learning_rate": 6.01124080476589e-05,
810
+ "loss": 0.3846,
811
+ "step": 1080
812
+ },
813
+ {
814
+ "epoch": 0.67,
815
+ "eval_loss": 0.3823437988758087,
816
+ "eval_runtime": 165.6364,
817
+ "eval_samples_per_second": 17.309,
818
+ "eval_steps_per_second": 4.329,
819
+ "step": 1080
820
+ },
821
+ {
822
+ "epoch": 0.68,
823
+ "grad_norm": 0.3543049991130829,
824
+ "learning_rate": 5.6180183903088844e-05,
825
+ "loss": 0.3696,
826
+ "step": 1100
827
+ },
828
+ {
829
+ "epoch": 0.68,
830
+ "eval_loss": 0.3821370601654053,
831
+ "eval_runtime": 165.5383,
832
+ "eval_samples_per_second": 17.319,
833
+ "eval_steps_per_second": 4.331,
834
+ "step": 1100
835
+ },
836
+ {
837
+ "epoch": 0.69,
838
+ "grad_norm": 0.374683141708374,
839
+ "learning_rate": 5.233011354768991e-05,
840
+ "loss": 0.3791,
841
+ "step": 1120
842
+ },
843
+ {
844
+ "epoch": 0.69,
845
+ "eval_loss": 0.38156434893608093,
846
+ "eval_runtime": 165.6726,
847
+ "eval_samples_per_second": 17.305,
848
+ "eval_steps_per_second": 4.328,
849
+ "step": 1120
850
+ },
851
+ {
852
+ "epoch": 0.71,
853
+ "grad_norm": 0.3851562738418579,
854
+ "learning_rate": 4.8569415128328945e-05,
855
+ "loss": 0.3726,
856
+ "step": 1140
857
+ },
858
+ {
859
+ "epoch": 0.71,
860
+ "eval_loss": 0.38082343339920044,
861
+ "eval_runtime": 165.6253,
862
+ "eval_samples_per_second": 17.31,
863
+ "eval_steps_per_second": 4.329,
864
+ "step": 1140
865
+ },
866
+ {
867
+ "epoch": 0.72,
868
+ "grad_norm": 0.422851026058197,
869
+ "learning_rate": 4.490513923655564e-05,
870
+ "loss": 0.3698,
871
+ "step": 1160
872
+ },
873
+ {
874
+ "epoch": 0.72,
875
+ "eval_loss": 0.38037535548210144,
876
+ "eval_runtime": 165.6523,
877
+ "eval_samples_per_second": 17.307,
878
+ "eval_steps_per_second": 4.328,
879
+ "step": 1160
880
+ },
881
+ {
882
+ "epoch": 0.73,
883
+ "grad_norm": 0.3657631278038025,
884
+ "learning_rate": 4.134415569008935e-05,
885
+ "loss": 0.3777,
886
+ "step": 1180
887
+ },
888
+ {
889
+ "epoch": 0.73,
890
+ "eval_loss": 0.3799656629562378,
891
+ "eval_runtime": 165.615,
892
+ "eval_samples_per_second": 17.311,
893
+ "eval_steps_per_second": 4.329,
894
+ "step": 1180
895
+ },
896
+ {
897
+ "epoch": 0.74,
898
+ "grad_norm": 0.34044766426086426,
899
+ "learning_rate": 3.789314065322218e-05,
900
+ "loss": 0.3637,
901
+ "step": 1200
902
+ },
903
+ {
904
+ "epoch": 0.74,
905
+ "eval_loss": 0.3793714940547943,
906
+ "eval_runtime": 165.687,
907
+ "eval_samples_per_second": 17.304,
908
+ "eval_steps_per_second": 4.327,
909
+ "step": 1200
910
+ },
911
+ {
912
+ "epoch": 0.76,
913
+ "grad_norm": 0.327467679977417,
914
+ "learning_rate": 3.455856412028593e-05,
915
+ "loss": 0.3653,
916
+ "step": 1220
917
+ },
918
+ {
919
+ "epoch": 0.76,
920
+ "eval_loss": 0.3786996603012085,
921
+ "eval_runtime": 165.7148,
922
+ "eval_samples_per_second": 17.301,
923
+ "eval_steps_per_second": 4.327,
924
+ "step": 1220
925
+ },
926
+ {
927
+ "epoch": 0.77,
928
+ "grad_norm": 0.3492739796638489,
929
+ "learning_rate": 3.1346677785647704e-05,
930
+ "loss": 0.382,
931
+ "step": 1240
932
+ },
933
+ {
934
+ "epoch": 0.77,
935
+ "eval_loss": 0.3782605230808258,
936
+ "eval_runtime": 165.621,
937
+ "eval_samples_per_second": 17.311,
938
+ "eval_steps_per_second": 4.329,
939
+ "step": 1240
940
+ },
941
+ {
942
+ "epoch": 0.78,
943
+ "grad_norm": 0.3024798333644867,
944
+ "learning_rate": 2.826350332297667e-05,
945
+ "loss": 0.3587,
946
+ "step": 1260
947
+ },
948
+ {
949
+ "epoch": 0.78,
950
+ "eval_loss": 0.37805166840553284,
951
+ "eval_runtime": 165.623,
952
+ "eval_samples_per_second": 17.31,
953
+ "eval_steps_per_second": 4.329,
954
+ "step": 1260
955
+ },
956
+ {
957
+ "epoch": 0.79,
958
+ "grad_norm": 0.3727082312107086,
959
+ "learning_rate": 2.531482109575547e-05,
960
+ "loss": 0.3729,
961
+ "step": 1280
962
+ },
963
+ {
964
+ "epoch": 0.79,
965
+ "eval_loss": 0.3775557577610016,
966
+ "eval_runtime": 165.6074,
967
+ "eval_samples_per_second": 17.312,
968
+ "eval_steps_per_second": 4.33,
969
+ "step": 1280
970
+ },
971
+ {
972
+ "epoch": 0.81,
973
+ "grad_norm": 0.41581809520721436,
974
+ "learning_rate": 2.250615932020238e-05,
975
+ "loss": 0.3731,
976
+ "step": 1300
977
+ },
978
+ {
979
+ "epoch": 0.81,
980
+ "eval_loss": 0.37723448872566223,
981
+ "eval_runtime": 165.6275,
982
+ "eval_samples_per_second": 17.31,
983
+ "eval_steps_per_second": 4.329,
984
+ "step": 1300
985
+ },
986
+ {
987
+ "epoch": 0.82,
988
+ "grad_norm": 0.44623810052871704,
989
+ "learning_rate": 1.9842783700921196e-05,
990
+ "loss": 0.3757,
991
+ "step": 1320
992
+ },
993
+ {
994
+ "epoch": 0.82,
995
+ "eval_loss": 0.3769790530204773,
996
+ "eval_runtime": 165.5377,
997
+ "eval_samples_per_second": 17.319,
998
+ "eval_steps_per_second": 4.331,
999
+ "step": 1320
1000
+ },
1001
+ {
1002
+ "epoch": 0.83,
1003
+ "grad_norm": 0.365567147731781,
1004
+ "learning_rate": 1.732968755871063e-05,
1005
+ "loss": 0.3733,
1006
+ "step": 1340
1007
+ },
1008
+ {
1009
+ "epoch": 0.83,
1010
+ "eval_loss": 0.3767223656177521,
1011
+ "eval_runtime": 165.6665,
1012
+ "eval_samples_per_second": 17.306,
1013
+ "eval_steps_per_second": 4.328,
1014
+ "step": 1340
1015
+ },
1016
+ {
1017
+ "epoch": 0.84,
1018
+ "grad_norm": 0.4132380187511444,
1019
+ "learning_rate": 1.4971582469040957e-05,
1020
+ "loss": 0.3792,
1021
+ "step": 1360
1022
+ },
1023
+ {
1024
+ "epoch": 0.84,
1025
+ "eval_loss": 0.3763655126094818,
1026
+ "eval_runtime": 165.6456,
1027
+ "eval_samples_per_second": 17.308,
1028
+ "eval_steps_per_second": 4.329,
1029
+ "step": 1360
1030
+ },
1031
+ {
1032
+ "epoch": 0.86,
1033
+ "grad_norm": 0.33449500799179077,
1034
+ "learning_rate": 1.2772889428749524e-05,
1035
+ "loss": 0.3678,
1036
+ "step": 1380
1037
+ },
1038
+ {
1039
+ "epoch": 0.86,
1040
+ "eval_loss": 0.3761462867259979,
1041
+ "eval_runtime": 165.6026,
1042
+ "eval_samples_per_second": 17.313,
1043
+ "eval_steps_per_second": 4.33,
1044
+ "step": 1380
1045
+ },
1046
+ {
1047
+ "epoch": 0.87,
1048
+ "grad_norm": 0.28829070925712585,
1049
+ "learning_rate": 1.0737730567515847e-05,
1050
+ "loss": 0.3604,
1051
+ "step": 1400
1052
+ },
1053
+ {
1054
+ "epoch": 0.87,
1055
+ "eval_loss": 0.3759004473686218,
1056
+ "eval_runtime": 165.5228,
1057
+ "eval_samples_per_second": 17.321,
1058
+ "eval_steps_per_second": 4.332,
1059
+ "step": 1400
1060
+ },
1061
+ {
1062
+ "epoch": 0.88,
1063
+ "grad_norm": 0.4277011454105377,
1064
+ "learning_rate": 8.869921419655457e-06,
1065
+ "loss": 0.3496,
1066
+ "step": 1420
1067
+ },
1068
+ {
1069
+ "epoch": 0.88,
1070
+ "eval_loss": 0.3757947087287903,
1071
+ "eval_runtime": 165.6492,
1072
+ "eval_samples_per_second": 17.308,
1073
+ "eval_steps_per_second": 4.328,
1074
+ "step": 1420
1075
+ },
1076
+ {
1077
+ "epoch": 0.89,
1078
+ "grad_norm": 0.40312379598617554,
1079
+ "learning_rate": 7.172963770721341e-06,
1080
+ "loss": 0.3676,
1081
+ "step": 1440
1082
+ },
1083
+ {
1084
+ "epoch": 0.89,
1085
+ "eval_loss": 0.3757094442844391,
1086
+ "eval_runtime": 165.5144,
1087
+ "eval_samples_per_second": 17.322,
1088
+ "eval_steps_per_second": 4.332,
1089
+ "step": 1440
1090
+ },
1091
+ {
1092
+ "epoch": 0.91,
1093
+ "grad_norm": 0.45307889580726624,
1094
+ "learning_rate": 5.650039092324766e-06,
1095
+ "loss": 0.3678,
1096
+ "step": 1460
1097
+ },
1098
+ {
1099
+ "epoch": 0.91,
1100
+ "eval_loss": 0.37566059827804565,
1101
+ "eval_runtime": 165.5183,
1102
+ "eval_samples_per_second": 17.321,
1103
+ "eval_steps_per_second": 4.332,
1104
+ "step": 1460
1105
+ },
1106
+ {
1107
+ "epoch": 0.92,
1108
+ "grad_norm": 0.31653299927711487,
1109
+ "learning_rate": 4.304002577483357e-06,
1110
+ "loss": 0.3646,
1111
+ "step": 1480
1112
+ },
1113
+ {
1114
+ "epoch": 0.92,
1115
+ "eval_loss": 0.3755495548248291,
1116
+ "eval_runtime": 165.5577,
1117
+ "eval_samples_per_second": 17.317,
1118
+ "eval_steps_per_second": 4.331,
1119
+ "step": 1480
1120
+ },
1121
+ {
1122
+ "epoch": 0.93,
1123
+ "grad_norm": 0.3237595856189728,
1124
+ "learning_rate": 3.13737778767923e-06,
1125
+ "loss": 0.3621,
1126
+ "step": 1500
1127
+ },
1128
+ {
1129
+ "epoch": 0.93,
1130
+ "eval_loss": 0.3754778504371643,
1131
+ "eval_runtime": 165.5564,
1132
+ "eval_samples_per_second": 17.317,
1133
+ "eval_steps_per_second": 4.331,
1134
+ "step": 1500
1135
+ },
1136
+ {
1137
+ "epoch": 0.94,
1138
+ "grad_norm": 0.41257408261299133,
1139
+ "learning_rate": 2.1523519216631094e-06,
1140
+ "loss": 0.3825,
1141
+ "step": 1520
1142
+ },
1143
+ {
1144
+ "epoch": 0.94,
1145
+ "eval_loss": 0.3754417300224304,
1146
+ "eval_runtime": 165.5324,
1147
+ "eval_samples_per_second": 17.32,
1148
+ "eval_steps_per_second": 4.331,
1149
+ "step": 1520
1150
+ },
1151
+ {
1152
+ "epoch": 0.95,
1153
+ "grad_norm": 0.3975638747215271,
1154
+ "learning_rate": 1.350771714874166e-06,
1155
+ "loss": 0.3718,
1156
+ "step": 1540
1157
+ },
1158
+ {
1159
+ "epoch": 0.95,
1160
+ "eval_loss": 0.37540262937545776,
1161
+ "eval_runtime": 165.5691,
1162
+ "eval_samples_per_second": 17.316,
1163
+ "eval_steps_per_second": 4.331,
1164
+ "step": 1540
1165
+ }
1166
+ ],
1167
+ "logging_steps": 20,
1168
+ "max_steps": 1612,
1169
+ "num_input_tokens_seen": 0,
1170
+ "num_train_epochs": 1,
1171
+ "save_steps": 20,
1172
+ "total_flos": 7.950352554330685e+17,
1173
+ "train_batch_size": 8,
1174
+ "trial_name": null,
1175
+ "trial_params": null
1176
+ }
checkpoint-1540/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19cdc3b0645f297de07155d8f0cee10c20d51defbbc762523c42ad678ebd6dbd
3
+ size 5176
checkpoint-1560/README.md ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+
201
+
202
+ ### Framework versions
203
+
204
+ - PEFT 0.8.2
checkpoint-1560/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.1,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 8,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "v_proj",
23
+ "k_proj",
24
+ "q_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": "CAUSAL_LM",
28
+ "use_rslora": false
29
+ }
checkpoint-1560/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16b69a36f9d96bfa0da891b712cb263ca17c339b5297372df9e28ba23a58bd8b
3
+ size 27297032
checkpoint-1560/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad4b3bca7f4e90962faaa92bc773fcf77b742d81d70de6071bb9a45ebcad5acb
3
+ size 54678266
checkpoint-1560/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b6f3359aa7364fe2693f1ca4441a4caf3e6abbc3936fcd5626f390f5b0c8188
3
+ size 14512
checkpoint-1560/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcb6dc404981179f74dfe8d839f63677af6a56077cc2703fa12e3c1b267e080c
3
+ size 14512
checkpoint-1560/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f9dfbdfed60fde5c5c1d041621c1dffcb5a726ce0554cd12e0f0d8148be07c2
3
+ size 1000
checkpoint-1560/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-1560/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1560/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-1560/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [],
31
+ "bos_token": "<s>",
32
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
checkpoint-1560/trainer_state.json ADDED
@@ -0,0 +1,1191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.37537458539009094,
3
+ "best_model_checkpoint": "./mistral/22-02-24-Weni-ZeroShot-3.3.3-Mistral-7b-Multilanguage-3.2.0_Zeroshot-2_max_steps-1612_batch_16_2024-02-22_ppid_1326/checkpoint-1560",
4
+ "epoch": 0.9671419714817111,
5
+ "eval_steps": 20,
6
+ "global_step": 1560,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "grad_norm": 1.6690024137496948,
14
+ "learning_rate": 2.3602484472049692e-05,
15
+ "loss": 1.8728,
16
+ "step": 20
17
+ },
18
+ {
19
+ "epoch": 0.01,
20
+ "eval_loss": 1.7905555963516235,
21
+ "eval_runtime": 165.4628,
22
+ "eval_samples_per_second": 17.327,
23
+ "eval_steps_per_second": 4.333,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.02,
28
+ "grad_norm": 1.7746976613998413,
29
+ "learning_rate": 4.8447204968944106e-05,
30
+ "loss": 1.4796,
31
+ "step": 40
32
+ },
33
+ {
34
+ "epoch": 0.02,
35
+ "eval_loss": 1.1875672340393066,
36
+ "eval_runtime": 165.7743,
37
+ "eval_samples_per_second": 17.295,
38
+ "eval_steps_per_second": 4.325,
39
+ "step": 40
40
+ },
41
+ {
42
+ "epoch": 0.04,
43
+ "grad_norm": 1.1963611841201782,
44
+ "learning_rate": 7.329192546583851e-05,
45
+ "loss": 0.8318,
46
+ "step": 60
47
+ },
48
+ {
49
+ "epoch": 0.04,
50
+ "eval_loss": 0.6325646638870239,
51
+ "eval_runtime": 165.8606,
52
+ "eval_samples_per_second": 17.286,
53
+ "eval_steps_per_second": 4.323,
54
+ "step": 60
55
+ },
56
+ {
57
+ "epoch": 0.05,
58
+ "grad_norm": 0.6274264454841614,
59
+ "learning_rate": 9.813664596273293e-05,
60
+ "loss": 0.5478,
61
+ "step": 80
62
+ },
63
+ {
64
+ "epoch": 0.05,
65
+ "eval_loss": 0.541927695274353,
66
+ "eval_runtime": 165.8755,
67
+ "eval_samples_per_second": 17.284,
68
+ "eval_steps_per_second": 4.323,
69
+ "step": 80
70
+ },
71
+ {
72
+ "epoch": 0.06,
73
+ "grad_norm": 0.7583674788475037,
74
+ "learning_rate": 0.00012298136645962735,
75
+ "loss": 0.517,
76
+ "step": 100
77
+ },
78
+ {
79
+ "epoch": 0.06,
80
+ "eval_loss": 0.5157255530357361,
81
+ "eval_runtime": 165.8227,
82
+ "eval_samples_per_second": 17.29,
83
+ "eval_steps_per_second": 4.324,
84
+ "step": 100
85
+ },
86
+ {
87
+ "epoch": 0.07,
88
+ "grad_norm": 0.496155321598053,
89
+ "learning_rate": 0.00014782608695652173,
90
+ "loss": 0.5109,
91
+ "step": 120
92
+ },
93
+ {
94
+ "epoch": 0.07,
95
+ "eval_loss": 0.49060019850730896,
96
+ "eval_runtime": 165.8171,
97
+ "eval_samples_per_second": 17.29,
98
+ "eval_steps_per_second": 4.324,
99
+ "step": 120
100
+ },
101
+ {
102
+ "epoch": 0.09,
103
+ "grad_norm": 0.3945171535015106,
104
+ "learning_rate": 0.00017142857142857143,
105
+ "loss": 0.4656,
106
+ "step": 140
107
+ },
108
+ {
109
+ "epoch": 0.09,
110
+ "eval_loss": 0.4657692313194275,
111
+ "eval_runtime": 165.7753,
112
+ "eval_samples_per_second": 17.294,
113
+ "eval_steps_per_second": 4.325,
114
+ "step": 140
115
+ },
116
+ {
117
+ "epoch": 0.1,
118
+ "grad_norm": 0.3318285346031189,
119
+ "learning_rate": 0.00019627329192546585,
120
+ "loss": 0.4409,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.1,
125
+ "eval_loss": 0.45186159014701843,
126
+ "eval_runtime": 165.7746,
127
+ "eval_samples_per_second": 17.295,
128
+ "eval_steps_per_second": 4.325,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 0.11,
133
+ "grad_norm": 0.4603807330131531,
134
+ "learning_rate": 0.00019993226958500473,
135
+ "loss": 0.4316,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.11,
140
+ "eval_loss": 0.4474850594997406,
141
+ "eval_runtime": 165.6607,
142
+ "eval_samples_per_second": 17.306,
143
+ "eval_steps_per_second": 4.328,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 0.12,
148
+ "grad_norm": 0.4568885862827301,
149
+ "learning_rate": 0.00019967929472585524,
150
+ "loss": 0.4297,
151
+ "step": 200
152
+ },
153
+ {
154
+ "epoch": 0.12,
155
+ "eval_loss": 0.4427547752857208,
156
+ "eval_runtime": 165.7208,
157
+ "eval_samples_per_second": 17.3,
158
+ "eval_steps_per_second": 4.327,
159
+ "step": 200
160
+ },
161
+ {
162
+ "epoch": 0.14,
163
+ "grad_norm": 0.384003221988678,
164
+ "learning_rate": 0.00019923944021970962,
165
+ "loss": 0.4226,
166
+ "step": 220
167
+ },
168
+ {
169
+ "epoch": 0.14,
170
+ "eval_loss": 0.4389376938343048,
171
+ "eval_runtime": 165.7023,
172
+ "eval_samples_per_second": 17.302,
173
+ "eval_steps_per_second": 4.327,
174
+ "step": 220
175
+ },
176
+ {
177
+ "epoch": 0.15,
178
+ "grad_norm": 0.30612272024154663,
179
+ "learning_rate": 0.00019861353070979048,
180
+ "loss": 0.4321,
181
+ "step": 240
182
+ },
183
+ {
184
+ "epoch": 0.15,
185
+ "eval_loss": 0.4359733462333679,
186
+ "eval_runtime": 165.7286,
187
+ "eval_samples_per_second": 17.299,
188
+ "eval_steps_per_second": 4.326,
189
+ "step": 240
190
+ },
191
+ {
192
+ "epoch": 0.16,
193
+ "grad_norm": 0.4149855971336365,
194
+ "learning_rate": 0.0001978027396569313,
195
+ "loss": 0.4261,
196
+ "step": 260
197
+ },
198
+ {
199
+ "epoch": 0.16,
200
+ "eval_loss": 0.4336954355239868,
201
+ "eval_runtime": 165.7171,
202
+ "eval_samples_per_second": 17.301,
203
+ "eval_steps_per_second": 4.327,
204
+ "step": 260
205
+ },
206
+ {
207
+ "epoch": 0.17,
208
+ "grad_norm": 0.28470170497894287,
209
+ "learning_rate": 0.00019680858713956126,
210
+ "loss": 0.4235,
211
+ "step": 280
212
+ },
213
+ {
214
+ "epoch": 0.17,
215
+ "eval_loss": 0.4306911528110504,
216
+ "eval_runtime": 165.7298,
217
+ "eval_samples_per_second": 17.299,
218
+ "eval_steps_per_second": 4.326,
219
+ "step": 280
220
+ },
221
+ {
222
+ "epoch": 0.19,
223
+ "grad_norm": 0.3317676782608032,
224
+ "learning_rate": 0.00019563293700384832,
225
+ "loss": 0.4279,
226
+ "step": 300
227
+ },
228
+ {
229
+ "epoch": 0.19,
230
+ "eval_loss": 0.4280063509941101,
231
+ "eval_runtime": 165.7359,
232
+ "eval_samples_per_second": 17.299,
233
+ "eval_steps_per_second": 4.326,
234
+ "step": 300
235
+ },
236
+ {
237
+ "epoch": 0.2,
238
+ "grad_norm": 0.3677004277706146,
239
+ "learning_rate": 0.0001942779933693437,
240
+ "loss": 0.419,
241
+ "step": 320
242
+ },
243
+ {
244
+ "epoch": 0.2,
245
+ "eval_loss": 0.425252765417099,
246
+ "eval_runtime": 165.7143,
247
+ "eval_samples_per_second": 17.301,
248
+ "eval_steps_per_second": 4.327,
249
+ "step": 320
250
+ },
251
+ {
252
+ "epoch": 0.21,
253
+ "grad_norm": 0.34667083621025085,
254
+ "learning_rate": 0.00019274629649667838,
255
+ "loss": 0.4129,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 0.21,
260
+ "eval_loss": 0.423022985458374,
261
+ "eval_runtime": 165.6904,
262
+ "eval_samples_per_second": 17.303,
263
+ "eval_steps_per_second": 4.327,
264
+ "step": 340
265
+ },
266
+ {
267
+ "epoch": 0.22,
268
+ "grad_norm": 0.37289124727249146,
269
+ "learning_rate": 0.00019104071802505943,
270
+ "loss": 0.4097,
271
+ "step": 360
272
+ },
273
+ {
274
+ "epoch": 0.22,
275
+ "eval_loss": 0.4223038852214813,
276
+ "eval_runtime": 165.7184,
277
+ "eval_samples_per_second": 17.3,
278
+ "eval_steps_per_second": 4.327,
279
+ "step": 360
280
+ },
281
+ {
282
+ "epoch": 0.24,
283
+ "grad_norm": 0.9662333130836487,
284
+ "learning_rate": 0.00018926226943248415,
285
+ "loss": 0.4204,
286
+ "step": 380
287
+ },
288
+ {
289
+ "epoch": 0.24,
290
+ "eval_loss": 0.41996675729751587,
291
+ "eval_runtime": 165.7378,
292
+ "eval_samples_per_second": 17.298,
293
+ "eval_steps_per_second": 4.326,
294
+ "step": 380
295
+ },
296
+ {
297
+ "epoch": 0.25,
298
+ "grad_norm": 0.43084949254989624,
299
+ "learning_rate": 0.00018722711057125052,
300
+ "loss": 0.4042,
301
+ "step": 400
302
+ },
303
+ {
304
+ "epoch": 0.25,
305
+ "eval_loss": 0.4190637767314911,
306
+ "eval_runtime": 165.6378,
307
+ "eval_samples_per_second": 17.309,
308
+ "eval_steps_per_second": 4.329,
309
+ "step": 400
310
+ },
311
+ {
312
+ "epoch": 0.26,
313
+ "grad_norm": 0.3700352609157562,
314
+ "learning_rate": 0.00018502841753095908,
315
+ "loss": 0.4134,
316
+ "step": 420
317
+ },
318
+ {
319
+ "epoch": 0.26,
320
+ "eval_loss": 0.4176079034805298,
321
+ "eval_runtime": 165.7021,
322
+ "eval_samples_per_second": 17.302,
323
+ "eval_steps_per_second": 4.327,
324
+ "step": 420
325
+ },
326
+ {
327
+ "epoch": 0.27,
328
+ "grad_norm": 0.2829599976539612,
329
+ "learning_rate": 0.00018267031244128938,
330
+ "loss": 0.4006,
331
+ "step": 440
332
+ },
333
+ {
334
+ "epoch": 0.27,
335
+ "eval_loss": 0.4157721698284149,
336
+ "eval_runtime": 165.7174,
337
+ "eval_samples_per_second": 17.301,
338
+ "eval_steps_per_second": 4.327,
339
+ "step": 440
340
+ },
341
+ {
342
+ "epoch": 0.29,
343
+ "grad_norm": 0.2943759560585022,
344
+ "learning_rate": 0.00018015721629907882,
345
+ "loss": 0.4004,
346
+ "step": 460
347
+ },
348
+ {
349
+ "epoch": 0.29,
350
+ "eval_loss": 0.4140998125076294,
351
+ "eval_runtime": 165.7622,
352
+ "eval_samples_per_second": 17.296,
353
+ "eval_steps_per_second": 4.325,
354
+ "step": 460
355
+ },
356
+ {
357
+ "epoch": 0.3,
358
+ "grad_norm": 0.36071881651878357,
359
+ "learning_rate": 0.00017749384067979764,
360
+ "loss": 0.3967,
361
+ "step": 480
362
+ },
363
+ {
364
+ "epoch": 0.3,
365
+ "eval_loss": 0.4122526943683624,
366
+ "eval_runtime": 165.6609,
367
+ "eval_samples_per_second": 17.306,
368
+ "eval_steps_per_second": 4.328,
369
+ "step": 480
370
+ },
371
+ {
372
+ "epoch": 0.31,
373
+ "grad_norm": 0.3050592243671417,
374
+ "learning_rate": 0.00017468517890424455,
375
+ "loss": 0.4089,
376
+ "step": 500
377
+ },
378
+ {
379
+ "epoch": 0.31,
380
+ "eval_loss": 0.4099767506122589,
381
+ "eval_runtime": 165.7178,
382
+ "eval_samples_per_second": 17.3,
383
+ "eval_steps_per_second": 4.327,
384
+ "step": 500
385
+ },
386
+ {
387
+ "epoch": 0.32,
388
+ "grad_norm": 0.3138140141963959,
389
+ "learning_rate": 0.00017173649667702337,
390
+ "loss": 0.3924,
391
+ "step": 520
392
+ },
393
+ {
394
+ "epoch": 0.32,
395
+ "eval_loss": 0.40873026847839355,
396
+ "eval_runtime": 165.7231,
397
+ "eval_samples_per_second": 17.3,
398
+ "eval_steps_per_second": 4.326,
399
+ "step": 520
400
+ },
401
+ {
402
+ "epoch": 0.33,
403
+ "grad_norm": 0.299787700176239,
404
+ "learning_rate": 0.0001686533222143523,
405
+ "loss": 0.4118,
406
+ "step": 540
407
+ },
408
+ {
409
+ "epoch": 0.33,
410
+ "eval_loss": 0.40794187784194946,
411
+ "eval_runtime": 165.723,
412
+ "eval_samples_per_second": 17.3,
413
+ "eval_steps_per_second": 4.326,
414
+ "step": 540
415
+ },
416
+ {
417
+ "epoch": 0.35,
418
+ "grad_norm": 0.37338224053382874,
419
+ "learning_rate": 0.0001654414358797141,
420
+ "loss": 0.4027,
421
+ "step": 560
422
+ },
423
+ {
424
+ "epoch": 0.35,
425
+ "eval_loss": 0.4068893790245056,
426
+ "eval_runtime": 165.5504,
427
+ "eval_samples_per_second": 17.318,
428
+ "eval_steps_per_second": 4.331,
429
+ "step": 560
430
+ },
431
+ {
432
+ "epoch": 0.36,
433
+ "grad_norm": 0.3397510349750519,
434
+ "learning_rate": 0.00016210685934677782,
435
+ "loss": 0.393,
436
+ "step": 580
437
+ },
438
+ {
439
+ "epoch": 0.36,
440
+ "eval_loss": 0.40551885962486267,
441
+ "eval_runtime": 165.5533,
442
+ "eval_samples_per_second": 17.318,
443
+ "eval_steps_per_second": 4.331,
444
+ "step": 580
445
+ },
446
+ {
447
+ "epoch": 0.37,
448
+ "grad_norm": 0.3803115487098694,
449
+ "learning_rate": 0.00015883106145163397,
450
+ "loss": 0.4103,
451
+ "step": 600
452
+ },
453
+ {
454
+ "epoch": 0.37,
455
+ "eval_loss": 0.40474218130111694,
456
+ "eval_runtime": 165.6206,
457
+ "eval_samples_per_second": 17.311,
458
+ "eval_steps_per_second": 4.329,
459
+ "step": 600
460
+ },
461
+ {
462
+ "epoch": 0.38,
463
+ "grad_norm": 0.3368137776851654,
464
+ "learning_rate": 0.00015527541943543543,
465
+ "loss": 0.3896,
466
+ "step": 620
467
+ },
468
+ {
469
+ "epoch": 0.38,
470
+ "eval_loss": 0.40325844287872314,
471
+ "eval_runtime": 165.6617,
472
+ "eval_samples_per_second": 17.306,
473
+ "eval_steps_per_second": 4.328,
474
+ "step": 620
475
+ },
476
+ {
477
+ "epoch": 0.4,
478
+ "grad_norm": 0.32399722933769226,
479
+ "learning_rate": 0.00015161614656089196,
480
+ "loss": 0.3912,
481
+ "step": 640
482
+ },
483
+ {
484
+ "epoch": 0.4,
485
+ "eval_loss": 0.4015989899635315,
486
+ "eval_runtime": 165.6304,
487
+ "eval_samples_per_second": 17.31,
488
+ "eval_steps_per_second": 4.329,
489
+ "step": 640
490
+ },
491
+ {
492
+ "epoch": 0.41,
493
+ "grad_norm": 0.42368754744529724,
494
+ "learning_rate": 0.0001478601032660207,
495
+ "loss": 0.3897,
496
+ "step": 660
497
+ },
498
+ {
499
+ "epoch": 0.41,
500
+ "eval_loss": 0.40123647451400757,
501
+ "eval_runtime": 165.6021,
502
+ "eval_samples_per_second": 17.313,
503
+ "eval_steps_per_second": 4.33,
504
+ "step": 660
505
+ },
506
+ {
507
+ "epoch": 0.42,
508
+ "grad_norm": 0.36450713872909546,
509
+ "learning_rate": 0.00014401433141490152,
510
+ "loss": 0.3963,
511
+ "step": 680
512
+ },
513
+ {
514
+ "epoch": 0.42,
515
+ "eval_loss": 0.39942467212677,
516
+ "eval_runtime": 165.5741,
517
+ "eval_samples_per_second": 17.316,
518
+ "eval_steps_per_second": 4.33,
519
+ "step": 680
520
+ },
521
+ {
522
+ "epoch": 0.43,
523
+ "grad_norm": 0.3598613739013672,
524
+ "learning_rate": 0.00014008604109552665,
525
+ "loss": 0.3914,
526
+ "step": 700
527
+ },
528
+ {
529
+ "epoch": 0.43,
530
+ "eval_loss": 0.39811620116233826,
531
+ "eval_runtime": 165.6088,
532
+ "eval_samples_per_second": 17.312,
533
+ "eval_steps_per_second": 4.329,
534
+ "step": 700
535
+ },
536
+ {
537
+ "epoch": 0.45,
538
+ "grad_norm": 0.3440189063549042,
539
+ "learning_rate": 0.00013608259710226186,
540
+ "loss": 0.3769,
541
+ "step": 720
542
+ },
543
+ {
544
+ "epoch": 0.45,
545
+ "eval_loss": 0.396987646818161,
546
+ "eval_runtime": 165.6485,
547
+ "eval_samples_per_second": 17.308,
548
+ "eval_steps_per_second": 4.328,
549
+ "step": 720
550
+ },
551
+ {
552
+ "epoch": 0.46,
553
+ "grad_norm": 0.3481440544128418,
554
+ "learning_rate": 0.0001320115051282632,
555
+ "loss": 0.3904,
556
+ "step": 740
557
+ },
558
+ {
559
+ "epoch": 0.46,
560
+ "eval_loss": 0.3970092833042145,
561
+ "eval_runtime": 165.6528,
562
+ "eval_samples_per_second": 17.307,
563
+ "eval_steps_per_second": 4.328,
564
+ "step": 740
565
+ },
566
+ {
567
+ "epoch": 0.47,
568
+ "grad_norm": 0.4135587215423584,
569
+ "learning_rate": 0.0001278803976937355,
570
+ "loss": 0.3831,
571
+ "step": 760
572
+ },
573
+ {
574
+ "epoch": 0.47,
575
+ "eval_loss": 0.39514589309692383,
576
+ "eval_runtime": 165.7492,
577
+ "eval_samples_per_second": 17.297,
578
+ "eval_steps_per_second": 4.326,
579
+ "step": 760
580
+ },
581
+ {
582
+ "epoch": 0.48,
583
+ "grad_norm": 0.34816452860832214,
584
+ "learning_rate": 0.00012369701983641388,
585
+ "loss": 0.3922,
586
+ "step": 780
587
+ },
588
+ {
589
+ "epoch": 0.48,
590
+ "eval_loss": 0.3943038880825043,
591
+ "eval_runtime": 165.7169,
592
+ "eval_samples_per_second": 17.301,
593
+ "eval_steps_per_second": 4.327,
594
+ "step": 780
595
+ },
596
+ {
597
+ "epoch": 0.5,
598
+ "grad_norm": 0.4880768060684204,
599
+ "learning_rate": 0.0001194692145910969,
600
+ "loss": 0.403,
601
+ "step": 800
602
+ },
603
+ {
604
+ "epoch": 0.5,
605
+ "eval_loss": 0.3928041160106659,
606
+ "eval_runtime": 165.6712,
607
+ "eval_samples_per_second": 17.305,
608
+ "eval_steps_per_second": 4.328,
609
+ "step": 800
610
+ },
611
+ {
612
+ "epoch": 0.51,
613
+ "grad_norm": 0.3198924660682678,
614
+ "learning_rate": 0.00011520490828545361,
615
+ "loss": 0.3913,
616
+ "step": 820
617
+ },
618
+ {
619
+ "epoch": 0.51,
620
+ "eval_loss": 0.3922466039657593,
621
+ "eval_runtime": 165.7505,
622
+ "eval_samples_per_second": 17.297,
623
+ "eval_steps_per_second": 4.326,
624
+ "step": 820
625
+ },
626
+ {
627
+ "epoch": 0.52,
628
+ "grad_norm": 0.33648762106895447,
629
+ "learning_rate": 0.00011091209567967229,
630
+ "loss": 0.3836,
631
+ "step": 840
632
+ },
633
+ {
634
+ "epoch": 0.52,
635
+ "eval_loss": 0.39126288890838623,
636
+ "eval_runtime": 165.7535,
637
+ "eval_samples_per_second": 17.297,
638
+ "eval_steps_per_second": 4.326,
639
+ "step": 840
640
+ },
641
+ {
642
+ "epoch": 0.53,
643
+ "grad_norm": 0.2874184846878052,
644
+ "learning_rate": 0.00010659882497781187,
645
+ "loss": 0.3736,
646
+ "step": 860
647
+ },
648
+ {
649
+ "epoch": 0.53,
650
+ "eval_loss": 0.3903014063835144,
651
+ "eval_runtime": 165.7718,
652
+ "eval_samples_per_second": 17.295,
653
+ "eval_steps_per_second": 4.325,
654
+ "step": 860
655
+ },
656
+ {
657
+ "epoch": 0.55,
658
+ "grad_norm": 0.37889736890792847,
659
+ "learning_rate": 0.00010227318273895532,
660
+ "loss": 0.3773,
661
+ "step": 880
662
+ },
663
+ {
664
+ "epoch": 0.55,
665
+ "eval_loss": 0.38970035314559937,
666
+ "eval_runtime": 165.5823,
667
+ "eval_samples_per_second": 17.315,
668
+ "eval_steps_per_second": 4.33,
669
+ "step": 880
670
+ },
671
+ {
672
+ "epoch": 0.56,
673
+ "grad_norm": 0.32016921043395996,
674
+ "learning_rate": 9.794327871645574e-05,
675
+ "loss": 0.3883,
676
+ "step": 900
677
+ },
678
+ {
679
+ "epoch": 0.56,
680
+ "eval_loss": 0.38903746008872986,
681
+ "eval_runtime": 165.5591,
682
+ "eval_samples_per_second": 17.317,
683
+ "eval_steps_per_second": 4.331,
684
+ "step": 900
685
+ },
686
+ {
687
+ "epoch": 0.57,
688
+ "grad_norm": 0.34894031286239624,
689
+ "learning_rate": 9.361723065369682e-05,
690
+ "loss": 0.3751,
691
+ "step": 920
692
+ },
693
+ {
694
+ "epoch": 0.57,
695
+ "eval_loss": 0.3883580267429352,
696
+ "eval_runtime": 165.5778,
697
+ "eval_samples_per_second": 17.315,
698
+ "eval_steps_per_second": 4.33,
699
+ "step": 920
700
+ },
701
+ {
702
+ "epoch": 0.58,
703
+ "grad_norm": 0.35315269231796265,
704
+ "learning_rate": 8.930314906487384e-05,
705
+ "loss": 0.3832,
706
+ "step": 940
707
+ },
708
+ {
709
+ "epoch": 0.58,
710
+ "eval_loss": 0.3874415457248688,
711
+ "eval_runtime": 165.6643,
712
+ "eval_samples_per_second": 17.306,
713
+ "eval_steps_per_second": 4.328,
714
+ "step": 940
715
+ },
716
+ {
717
+ "epoch": 0.6,
718
+ "grad_norm": 0.4276430904865265,
719
+ "learning_rate": 8.500912202932824e-05,
720
+ "loss": 0.3726,
721
+ "step": 960
722
+ },
723
+ {
724
+ "epoch": 0.6,
725
+ "eval_loss": 0.3868561387062073,
726
+ "eval_runtime": 165.6292,
727
+ "eval_samples_per_second": 17.31,
728
+ "eval_steps_per_second": 4.329,
729
+ "step": 960
730
+ },
731
+ {
732
+ "epoch": 0.61,
733
+ "grad_norm": 0.32810911536216736,
734
+ "learning_rate": 8.07432000279427e-05,
735
+ "loss": 0.3738,
736
+ "step": 980
737
+ },
738
+ {
739
+ "epoch": 0.61,
740
+ "eval_loss": 0.38609209656715393,
741
+ "eval_runtime": 165.6113,
742
+ "eval_samples_per_second": 17.312,
743
+ "eval_steps_per_second": 4.329,
744
+ "step": 980
745
+ },
746
+ {
747
+ "epoch": 0.62,
748
+ "grad_norm": 0.3598964214324951,
749
+ "learning_rate": 7.651338085002669e-05,
750
+ "loss": 0.3809,
751
+ "step": 1000
752
+ },
753
+ {
754
+ "epoch": 0.62,
755
+ "eval_loss": 0.3854670822620392,
756
+ "eval_runtime": 165.6347,
757
+ "eval_samples_per_second": 17.309,
758
+ "eval_steps_per_second": 4.329,
759
+ "step": 1000
760
+ },
761
+ {
762
+ "epoch": 0.63,
763
+ "grad_norm": 0.32283729314804077,
764
+ "learning_rate": 7.232759459898832e-05,
765
+ "loss": 0.3871,
766
+ "step": 1020
767
+ },
768
+ {
769
+ "epoch": 0.63,
770
+ "eval_loss": 0.38449159264564514,
771
+ "eval_runtime": 165.5636,
772
+ "eval_samples_per_second": 17.317,
773
+ "eval_steps_per_second": 4.331,
774
+ "step": 1020
775
+ },
776
+ {
777
+ "epoch": 0.64,
778
+ "grad_norm": 0.3151933252811432,
779
+ "learning_rate": 6.819368882490458e-05,
780
+ "loss": 0.3799,
781
+ "step": 1040
782
+ },
783
+ {
784
+ "epoch": 0.64,
785
+ "eval_loss": 0.3837529420852661,
786
+ "eval_runtime": 165.6596,
787
+ "eval_samples_per_second": 17.307,
788
+ "eval_steps_per_second": 4.328,
789
+ "step": 1040
790
+ },
791
+ {
792
+ "epoch": 0.66,
793
+ "grad_norm": 0.37252795696258545,
794
+ "learning_rate": 6.411941381186302e-05,
795
+ "loss": 0.3882,
796
+ "step": 1060
797
+ },
798
+ {
799
+ "epoch": 0.66,
800
+ "eval_loss": 0.38311225175857544,
801
+ "eval_runtime": 165.5928,
802
+ "eval_samples_per_second": 17.314,
803
+ "eval_steps_per_second": 4.33,
804
+ "step": 1060
805
+ },
806
+ {
807
+ "epoch": 0.67,
808
+ "grad_norm": 0.33380192518234253,
809
+ "learning_rate": 6.01124080476589e-05,
810
+ "loss": 0.3846,
811
+ "step": 1080
812
+ },
813
+ {
814
+ "epoch": 0.67,
815
+ "eval_loss": 0.3823437988758087,
816
+ "eval_runtime": 165.6364,
817
+ "eval_samples_per_second": 17.309,
818
+ "eval_steps_per_second": 4.329,
819
+ "step": 1080
820
+ },
821
+ {
822
+ "epoch": 0.68,
823
+ "grad_norm": 0.3543049991130829,
824
+ "learning_rate": 5.6180183903088844e-05,
825
+ "loss": 0.3696,
826
+ "step": 1100
827
+ },
828
+ {
829
+ "epoch": 0.68,
830
+ "eval_loss": 0.3821370601654053,
831
+ "eval_runtime": 165.5383,
832
+ "eval_samples_per_second": 17.319,
833
+ "eval_steps_per_second": 4.331,
834
+ "step": 1100
835
+ },
836
+ {
837
+ "epoch": 0.69,
838
+ "grad_norm": 0.374683141708374,
839
+ "learning_rate": 5.233011354768991e-05,
840
+ "loss": 0.3791,
841
+ "step": 1120
842
+ },
843
+ {
844
+ "epoch": 0.69,
845
+ "eval_loss": 0.38156434893608093,
846
+ "eval_runtime": 165.6726,
847
+ "eval_samples_per_second": 17.305,
848
+ "eval_steps_per_second": 4.328,
849
+ "step": 1120
850
+ },
851
+ {
852
+ "epoch": 0.71,
853
+ "grad_norm": 0.3851562738418579,
854
+ "learning_rate": 4.8569415128328945e-05,
855
+ "loss": 0.3726,
856
+ "step": 1140
857
+ },
858
+ {
859
+ "epoch": 0.71,
860
+ "eval_loss": 0.38082343339920044,
861
+ "eval_runtime": 165.6253,
862
+ "eval_samples_per_second": 17.31,
863
+ "eval_steps_per_second": 4.329,
864
+ "step": 1140
865
+ },
866
+ {
867
+ "epoch": 0.72,
868
+ "grad_norm": 0.422851026058197,
869
+ "learning_rate": 4.490513923655564e-05,
870
+ "loss": 0.3698,
871
+ "step": 1160
872
+ },
873
+ {
874
+ "epoch": 0.72,
875
+ "eval_loss": 0.38037535548210144,
876
+ "eval_runtime": 165.6523,
877
+ "eval_samples_per_second": 17.307,
878
+ "eval_steps_per_second": 4.328,
879
+ "step": 1160
880
+ },
881
+ {
882
+ "epoch": 0.73,
883
+ "grad_norm": 0.3657631278038025,
884
+ "learning_rate": 4.134415569008935e-05,
885
+ "loss": 0.3777,
886
+ "step": 1180
887
+ },
888
+ {
889
+ "epoch": 0.73,
890
+ "eval_loss": 0.3799656629562378,
891
+ "eval_runtime": 165.615,
892
+ "eval_samples_per_second": 17.311,
893
+ "eval_steps_per_second": 4.329,
894
+ "step": 1180
895
+ },
896
+ {
897
+ "epoch": 0.74,
898
+ "grad_norm": 0.34044766426086426,
899
+ "learning_rate": 3.789314065322218e-05,
900
+ "loss": 0.3637,
901
+ "step": 1200
902
+ },
903
+ {
904
+ "epoch": 0.74,
905
+ "eval_loss": 0.3793714940547943,
906
+ "eval_runtime": 165.687,
907
+ "eval_samples_per_second": 17.304,
908
+ "eval_steps_per_second": 4.327,
909
+ "step": 1200
910
+ },
911
+ {
912
+ "epoch": 0.76,
913
+ "grad_norm": 0.327467679977417,
914
+ "learning_rate": 3.455856412028593e-05,
915
+ "loss": 0.3653,
916
+ "step": 1220
917
+ },
918
+ {
919
+ "epoch": 0.76,
920
+ "eval_loss": 0.3786996603012085,
921
+ "eval_runtime": 165.7148,
922
+ "eval_samples_per_second": 17.301,
923
+ "eval_steps_per_second": 4.327,
924
+ "step": 1220
925
+ },
926
+ {
927
+ "epoch": 0.77,
928
+ "grad_norm": 0.3492739796638489,
929
+ "learning_rate": 3.1346677785647704e-05,
930
+ "loss": 0.382,
931
+ "step": 1240
932
+ },
933
+ {
934
+ "epoch": 0.77,
935
+ "eval_loss": 0.3782605230808258,
936
+ "eval_runtime": 165.621,
937
+ "eval_samples_per_second": 17.311,
938
+ "eval_steps_per_second": 4.329,
939
+ "step": 1240
940
+ },
941
+ {
942
+ "epoch": 0.78,
943
+ "grad_norm": 0.3024798333644867,
944
+ "learning_rate": 2.826350332297667e-05,
945
+ "loss": 0.3587,
946
+ "step": 1260
947
+ },
948
+ {
949
+ "epoch": 0.78,
950
+ "eval_loss": 0.37805166840553284,
951
+ "eval_runtime": 165.623,
952
+ "eval_samples_per_second": 17.31,
953
+ "eval_steps_per_second": 4.329,
954
+ "step": 1260
955
+ },
956
+ {
957
+ "epoch": 0.79,
958
+ "grad_norm": 0.3727082312107086,
959
+ "learning_rate": 2.531482109575547e-05,
960
+ "loss": 0.3729,
961
+ "step": 1280
962
+ },
963
+ {
964
+ "epoch": 0.79,
965
+ "eval_loss": 0.3775557577610016,
966
+ "eval_runtime": 165.6074,
967
+ "eval_samples_per_second": 17.312,
968
+ "eval_steps_per_second": 4.33,
969
+ "step": 1280
970
+ },
971
+ {
972
+ "epoch": 0.81,
973
+ "grad_norm": 0.41581809520721436,
974
+ "learning_rate": 2.250615932020238e-05,
975
+ "loss": 0.3731,
976
+ "step": 1300
977
+ },
978
+ {
979
+ "epoch": 0.81,
980
+ "eval_loss": 0.37723448872566223,
981
+ "eval_runtime": 165.6275,
982
+ "eval_samples_per_second": 17.31,
983
+ "eval_steps_per_second": 4.329,
984
+ "step": 1300
985
+ },
986
+ {
987
+ "epoch": 0.82,
988
+ "grad_norm": 0.44623810052871704,
989
+ "learning_rate": 1.9842783700921196e-05,
990
+ "loss": 0.3757,
991
+ "step": 1320
992
+ },
993
+ {
994
+ "epoch": 0.82,
995
+ "eval_loss": 0.3769790530204773,
996
+ "eval_runtime": 165.5377,
997
+ "eval_samples_per_second": 17.319,
998
+ "eval_steps_per_second": 4.331,
999
+ "step": 1320
1000
+ },
1001
+ {
1002
+ "epoch": 0.83,
1003
+ "grad_norm": 0.365567147731781,
1004
+ "learning_rate": 1.732968755871063e-05,
1005
+ "loss": 0.3733,
1006
+ "step": 1340
1007
+ },
1008
+ {
1009
+ "epoch": 0.83,
1010
+ "eval_loss": 0.3767223656177521,
1011
+ "eval_runtime": 165.6665,
1012
+ "eval_samples_per_second": 17.306,
1013
+ "eval_steps_per_second": 4.328,
1014
+ "step": 1340
1015
+ },
1016
+ {
1017
+ "epoch": 0.84,
1018
+ "grad_norm": 0.4132380187511444,
1019
+ "learning_rate": 1.4971582469040957e-05,
1020
+ "loss": 0.3792,
1021
+ "step": 1360
1022
+ },
1023
+ {
1024
+ "epoch": 0.84,
1025
+ "eval_loss": 0.3763655126094818,
1026
+ "eval_runtime": 165.6456,
1027
+ "eval_samples_per_second": 17.308,
1028
+ "eval_steps_per_second": 4.329,
1029
+ "step": 1360
1030
+ },
1031
+ {
1032
+ "epoch": 0.86,
1033
+ "grad_norm": 0.33449500799179077,
1034
+ "learning_rate": 1.2772889428749524e-05,
1035
+ "loss": 0.3678,
1036
+ "step": 1380
1037
+ },
1038
+ {
1039
+ "epoch": 0.86,
1040
+ "eval_loss": 0.3761462867259979,
1041
+ "eval_runtime": 165.6026,
1042
+ "eval_samples_per_second": 17.313,
1043
+ "eval_steps_per_second": 4.33,
1044
+ "step": 1380
1045
+ },
1046
+ {
1047
+ "epoch": 0.87,
1048
+ "grad_norm": 0.28829070925712585,
1049
+ "learning_rate": 1.0737730567515847e-05,
1050
+ "loss": 0.3604,
1051
+ "step": 1400
1052
+ },
1053
+ {
1054
+ "epoch": 0.87,
1055
+ "eval_loss": 0.3759004473686218,
1056
+ "eval_runtime": 165.5228,
1057
+ "eval_samples_per_second": 17.321,
1058
+ "eval_steps_per_second": 4.332,
1059
+ "step": 1400
1060
+ },
1061
+ {
1062
+ "epoch": 0.88,
1063
+ "grad_norm": 0.4277011454105377,
1064
+ "learning_rate": 8.869921419655457e-06,
1065
+ "loss": 0.3496,
1066
+ "step": 1420
1067
+ },
1068
+ {
1069
+ "epoch": 0.88,
1070
+ "eval_loss": 0.3757947087287903,
1071
+ "eval_runtime": 165.6492,
1072
+ "eval_samples_per_second": 17.308,
1073
+ "eval_steps_per_second": 4.328,
1074
+ "step": 1420
1075
+ },
1076
+ {
1077
+ "epoch": 0.89,
1078
+ "grad_norm": 0.40312379598617554,
1079
+ "learning_rate": 7.172963770721341e-06,
1080
+ "loss": 0.3676,
1081
+ "step": 1440
1082
+ },
1083
+ {
1084
+ "epoch": 0.89,
1085
+ "eval_loss": 0.3757094442844391,
1086
+ "eval_runtime": 165.5144,
1087
+ "eval_samples_per_second": 17.322,
1088
+ "eval_steps_per_second": 4.332,
1089
+ "step": 1440
1090
+ },
1091
+ {
1092
+ "epoch": 0.91,
1093
+ "grad_norm": 0.45307889580726624,
1094
+ "learning_rate": 5.650039092324766e-06,
1095
+ "loss": 0.3678,
1096
+ "step": 1460
1097
+ },
1098
+ {
1099
+ "epoch": 0.91,
1100
+ "eval_loss": 0.37566059827804565,
1101
+ "eval_runtime": 165.5183,
1102
+ "eval_samples_per_second": 17.321,
1103
+ "eval_steps_per_second": 4.332,
1104
+ "step": 1460
1105
+ },
1106
+ {
1107
+ "epoch": 0.92,
1108
+ "grad_norm": 0.31653299927711487,
1109
+ "learning_rate": 4.304002577483357e-06,
1110
+ "loss": 0.3646,
1111
+ "step": 1480
1112
+ },
1113
+ {
1114
+ "epoch": 0.92,
1115
+ "eval_loss": 0.3755495548248291,
1116
+ "eval_runtime": 165.5577,
1117
+ "eval_samples_per_second": 17.317,
1118
+ "eval_steps_per_second": 4.331,
1119
+ "step": 1480
1120
+ },
1121
+ {
1122
+ "epoch": 0.93,
1123
+ "grad_norm": 0.3237595856189728,
1124
+ "learning_rate": 3.13737778767923e-06,
1125
+ "loss": 0.3621,
1126
+ "step": 1500
1127
+ },
1128
+ {
1129
+ "epoch": 0.93,
1130
+ "eval_loss": 0.3754778504371643,
1131
+ "eval_runtime": 165.5564,
1132
+ "eval_samples_per_second": 17.317,
1133
+ "eval_steps_per_second": 4.331,
1134
+ "step": 1500
1135
+ },
1136
+ {
1137
+ "epoch": 0.94,
1138
+ "grad_norm": 0.41257408261299133,
1139
+ "learning_rate": 2.1523519216631094e-06,
1140
+ "loss": 0.3825,
1141
+ "step": 1520
1142
+ },
1143
+ {
1144
+ "epoch": 0.94,
1145
+ "eval_loss": 0.3754417300224304,
1146
+ "eval_runtime": 165.5324,
1147
+ "eval_samples_per_second": 17.32,
1148
+ "eval_steps_per_second": 4.331,
1149
+ "step": 1520
1150
+ },
1151
+ {
1152
+ "epoch": 0.95,
1153
+ "grad_norm": 0.3975638747215271,
1154
+ "learning_rate": 1.350771714874166e-06,
1155
+ "loss": 0.3718,
1156
+ "step": 1540
1157
+ },
1158
+ {
1159
+ "epoch": 0.95,
1160
+ "eval_loss": 0.37540262937545776,
1161
+ "eval_runtime": 165.5691,
1162
+ "eval_samples_per_second": 17.316,
1163
+ "eval_steps_per_second": 4.331,
1164
+ "step": 1540
1165
+ },
1166
+ {
1167
+ "epoch": 0.97,
1168
+ "grad_norm": 0.39280959963798523,
1169
+ "learning_rate": 7.341399771636948e-07,
1170
+ "loss": 0.3511,
1171
+ "step": 1560
1172
+ },
1173
+ {
1174
+ "epoch": 0.97,
1175
+ "eval_loss": 0.37537458539009094,
1176
+ "eval_runtime": 165.5674,
1177
+ "eval_samples_per_second": 17.316,
1178
+ "eval_steps_per_second": 4.331,
1179
+ "step": 1560
1180
+ }
1181
+ ],
1182
+ "logging_steps": 20,
1183
+ "max_steps": 1612,
1184
+ "num_input_tokens_seen": 0,
1185
+ "num_train_epochs": 1,
1186
+ "save_steps": 20,
1187
+ "total_flos": 8.055819878625444e+17,
1188
+ "train_batch_size": 8,
1189
+ "trial_name": null,
1190
+ "trial_params": null
1191
+ }
checkpoint-1560/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19cdc3b0645f297de07155d8f0cee10c20d51defbbc762523c42ad678ebd6dbd
3
+ size 5176
checkpoint-1580/README.md ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+
201
+
202
+ ### Framework versions
203
+
204
+ - PEFT 0.8.2
checkpoint-1580/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.1,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 8,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "v_proj",
23
+ "k_proj",
24
+ "q_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": "CAUSAL_LM",
28
+ "use_rslora": false
29
+ }
checkpoint-1580/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfab10e4d649130708f14db579c51291b350188b1f142c435d67658d36b5acf7
3
+ size 27297032
checkpoint-1580/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a748e2fc4af1a3e6064b55f91737d94015a16813ddf58c815ac7bd48528ad81
3
+ size 54678266
checkpoint-1580/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fa4d12b7e14ada28ecca3a6072302d6039354f825ca91b7a577b31d5c095b84
3
+ size 14512
checkpoint-1580/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ad93b45e8d42d37fa8389806f755b2fb642dbe2d1044b5725d44451d2f523a0
3
+ size 14512
checkpoint-1580/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f6f015b3a90e3b88b787118422147290c34f47a18f4c24717d7fa2a49792c80
3
+ size 1000
checkpoint-1580/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-1580/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff