Dorn4449 committed on
Commit
73f7c18
·
verified ·
1 Parent(s): 9c2b8a7

checkpoint step 11500

Browse files
Files changed (45) hide show
  1. checkpoint-11200/README.md +202 -0
  2. checkpoint-11200/adapter_config.json +31 -0
  3. checkpoint-11200/adapter_model.safetensors +3 -0
  4. checkpoint-11200/checkpoint-6000/adapter_config.json +31 -0
  5. checkpoint-11200/checkpoint-6000/adapter_model.safetensors +3 -0
  6. checkpoint-11200/checkpoint-6100/adapter_config.json +31 -0
  7. checkpoint-11200/checkpoint-6100/adapter_model.safetensors +3 -0
  8. checkpoint-11200/optimizer.pt +3 -0
  9. checkpoint-11200/rng_state.pth +3 -0
  10. checkpoint-11200/scheduler.pt +3 -0
  11. checkpoint-11200/special_tokens_map.json +24 -0
  12. checkpoint-11200/tokenizer.model +3 -0
  13. checkpoint-11200/tokenizer_config.json +44 -0
  14. checkpoint-11200/trainer_state.json +994 -0
  15. checkpoint-11200/training_args.bin +3 -0
  16. checkpoint-11300/README.md +202 -0
  17. checkpoint-11300/adapter_config.json +31 -0
  18. checkpoint-11300/adapter_model.safetensors +3 -0
  19. checkpoint-11300/checkpoint-6000/adapter_config.json +31 -0
  20. checkpoint-11300/checkpoint-6000/adapter_model.safetensors +3 -0
  21. checkpoint-11300/checkpoint-6100/adapter_config.json +31 -0
  22. checkpoint-11300/checkpoint-6100/adapter_model.safetensors +3 -0
  23. checkpoint-11300/optimizer.pt +3 -0
  24. checkpoint-11300/rng_state.pth +3 -0
  25. checkpoint-11300/scheduler.pt +3 -0
  26. checkpoint-11300/special_tokens_map.json +24 -0
  27. checkpoint-11300/tokenizer.model +3 -0
  28. checkpoint-11300/tokenizer_config.json +44 -0
  29. checkpoint-11300/trainer_state.json +1001 -0
  30. checkpoint-11300/training_args.bin +3 -0
  31. checkpoint-11400/README.md +202 -0
  32. checkpoint-11400/adapter_config.json +31 -0
  33. checkpoint-11400/adapter_model.safetensors +3 -0
  34. checkpoint-11400/checkpoint-6000/adapter_config.json +31 -0
  35. checkpoint-11400/checkpoint-6000/adapter_model.safetensors +3 -0
  36. checkpoint-11400/checkpoint-6100/adapter_config.json +31 -0
  37. checkpoint-11400/checkpoint-6100/adapter_model.safetensors +3 -0
  38. checkpoint-11400/optimizer.pt +3 -0
  39. checkpoint-11400/rng_state.pth +3 -0
  40. checkpoint-11400/scheduler.pt +3 -0
  41. checkpoint-11400/special_tokens_map.json +24 -0
  42. checkpoint-11400/tokenizer.model +3 -0
  43. checkpoint-11400/tokenizer_config.json +44 -0
  44. checkpoint-11400/trainer_state.json +1008 -0
  45. checkpoint-11400/training_args.bin +3 -0
checkpoint-11200/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Dorn4449/CyberSentinel-Mistral-7B-v3.8
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section covers use of the model without fine-tuning or plugging it into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section covers use of the model when it is fine-tuned for a task or plugged into a larger ecosystem/app. -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information is needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
checkpoint-11200/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-11200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28c7047bad95e2cd23d9ef1fa78cff591649d14c9a91d3d95a3cdc18297b8e72
3
+ size 27297032
checkpoint-11200/checkpoint-6000/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-11200/checkpoint-6000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:286625e19ea92f6bcbc715bf5d40bac2ac85da8b9ecb31679d50c38cb4b4b694
3
+ size 27297032
checkpoint-11200/checkpoint-6100/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-11200/checkpoint-6100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0a6ee2942c85113263a2331006fa30db088217df193b0fca3de2eebe282399b
3
+ size 27297032
checkpoint-11200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a8f91450a6c1bed6f042a4de2d9990131c4aba4bb34d583733bd18c9322f7a3
3
+ size 54744314
checkpoint-11200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30ed91d21a0ed527cbdcf9522ea9871149c06a1cf76709bc2a0f4d697f53f237
3
+ size 14244
checkpoint-11200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f536cb4b35f557b66d817fa1084e06b8bd2afc0fbabc02cc366681e17c917bb
3
+ size 1064
checkpoint-11200/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-11200/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-11200/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n",
34
+ "clean_up_tokenization_spaces": false,
35
+ "eos_token": "</s>",
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
checkpoint-11200/trainer_state.json ADDED
@@ -0,0 +1,994 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.2986666666666667,
5
+ "eval_steps": 500,
6
+ "global_step": 11200,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "entropy": 1.084044404476881,
13
+ "epoch": 0.010666666666666666,
14
+ "grad_norm": 0.796875,
15
+ "learning_rate": 7.92e-07,
16
+ "loss": 1.3782466125488282,
17
+ "mean_token_accuracy": 0.7083857330679894,
18
+ "num_tokens": 986491.0,
19
+ "step": 100
20
+ },
21
+ {
22
+ "entropy": 1.1007767178118228,
23
+ "epoch": 0.021333333333333333,
24
+ "grad_norm": 0.3125,
25
+ "learning_rate": 1.592e-06,
26
+ "loss": 1.2902149963378906,
27
+ "mean_token_accuracy": 0.7171274860203266,
28
+ "num_tokens": 1952567.0,
29
+ "step": 200
30
+ },
31
+ {
32
+ "entropy": 1.0802835547924041,
33
+ "epoch": 0.032,
34
+ "grad_norm": 0.28515625,
35
+ "learning_rate": 2.392e-06,
36
+ "loss": 1.170371322631836,
37
+ "mean_token_accuracy": 0.7354862231016159,
38
+ "num_tokens": 2935479.0,
39
+ "step": 300
40
+ },
41
+ {
42
+ "entropy": 1.0573877203464508,
43
+ "epoch": 0.042666666666666665,
44
+ "grad_norm": 0.23046875,
45
+ "learning_rate": 3.192e-06,
46
+ "loss": 1.062843780517578,
47
+ "mean_token_accuracy": 0.7536254300177098,
48
+ "num_tokens": 3920344.0,
49
+ "step": 400
50
+ },
51
+ {
52
+ "entropy": 1.0149462178349495,
53
+ "epoch": 0.05333333333333334,
54
+ "grad_norm": 0.212890625,
55
+ "learning_rate": 3.992e-06,
56
+ "loss": 0.9590372467041015,
57
+ "mean_token_accuracy": 0.7738636130094528,
58
+ "num_tokens": 4899217.0,
59
+ "step": 500
60
+ },
61
+ {
62
+ "entropy": 0.8931056271493435,
63
+ "epoch": 0.064,
64
+ "grad_norm": 0.28125,
65
+ "learning_rate": 4.792e-06,
66
+ "loss": 0.7998863983154297,
67
+ "mean_token_accuracy": 0.8012332341074944,
68
+ "num_tokens": 5883391.0,
69
+ "step": 600
70
+ },
71
+ {
72
+ "entropy": 0.6136953190714121,
73
+ "epoch": 0.07466666666666667,
74
+ "grad_norm": 1.1015625,
75
+ "learning_rate": 5.592000000000001e-06,
76
+ "loss": 0.5131131362915039,
77
+ "mean_token_accuracy": 0.866335421949625,
78
+ "num_tokens": 6857125.0,
79
+ "step": 700
80
+ },
81
+ {
82
+ "entropy": 0.2726459547691047,
83
+ "epoch": 0.08533333333333333,
84
+ "grad_norm": 0.474609375,
85
+ "learning_rate": 6.392e-06,
86
+ "loss": 0.18439870834350586,
87
+ "mean_token_accuracy": 0.9579810079932213,
88
+ "num_tokens": 7831290.0,
89
+ "step": 800
90
+ },
91
+ {
92
+ "entropy": 0.11284050881862641,
93
+ "epoch": 0.096,
94
+ "grad_norm": 0.1875,
95
+ "learning_rate": 7.192e-06,
96
+ "loss": 0.05271556854248047,
97
+ "mean_token_accuracy": 0.9904264670610428,
98
+ "num_tokens": 8815417.0,
99
+ "step": 900
100
+ },
101
+ {
102
+ "entropy": 0.068436967888847,
103
+ "epoch": 0.10666666666666667,
104
+ "grad_norm": 0.125,
105
+ "learning_rate": 7.992e-06,
106
+ "loss": 0.02549468755722046,
107
+ "mean_token_accuracy": 0.9951217715442181,
108
+ "num_tokens": 9782033.0,
109
+ "step": 1000
110
+ },
111
+ {
112
+ "entropy": 0.052372096767649055,
113
+ "epoch": 0.11733333333333333,
114
+ "grad_norm": 0.1318359375,
115
+ "learning_rate": 8.792e-06,
116
+ "loss": 0.017259199619293213,
117
+ "mean_token_accuracy": 0.9962555834650993,
118
+ "num_tokens": 10759962.0,
119
+ "step": 1100
120
+ },
121
+ {
122
+ "entropy": 0.03862797610927373,
123
+ "epoch": 0.128,
124
+ "grad_norm": 0.0867946669459343,
125
+ "learning_rate": 9.591999999999999e-06,
126
+ "loss": 0.013324768543243408,
127
+ "mean_token_accuracy": 0.9968111206591129,
128
+ "num_tokens": 987986.0,
129
+ "step": 1200
130
+ },
131
+ {
132
+ "entropy": 0.032093781144358215,
133
+ "epoch": 0.13866666666666666,
134
+ "grad_norm": 0.037592533975839615,
135
+ "learning_rate": 1.0392e-05,
136
+ "loss": 0.013392001390457153,
137
+ "mean_token_accuracy": 0.9965760576725006,
138
+ "num_tokens": 1968127.0,
139
+ "step": 1300
140
+ },
141
+ {
142
+ "entropy": 0.031904329673852774,
143
+ "epoch": 0.14933333333333335,
144
+ "grad_norm": 0.10515860468149185,
145
+ "learning_rate": 1.1192e-05,
146
+ "loss": 0.016373103857040404,
147
+ "mean_token_accuracy": 0.9955054900050163,
148
+ "num_tokens": 2965677.0,
149
+ "step": 1400
150
+ },
151
+ {
152
+ "entropy": 0.0265167937008664,
153
+ "epoch": 0.16,
154
+ "grad_norm": 0.12689532339572906,
155
+ "learning_rate": 1.1992e-05,
156
+ "loss": 0.011554093360900878,
157
+ "mean_token_accuracy": 0.996508517563343,
158
+ "num_tokens": 3938144.0,
159
+ "step": 1500
160
+ },
161
+ {
162
+ "entropy": 0.024146563813555986,
163
+ "epoch": 0.17066666666666666,
164
+ "grad_norm": 0.03401608020067215,
165
+ "learning_rate": 1.2792e-05,
166
+ "loss": 0.010398292541503906,
167
+ "mean_token_accuracy": 0.9966818282008171,
168
+ "num_tokens": 4897019.0,
169
+ "step": 1600
170
+ },
171
+ {
172
+ "entropy": 0.025755486716516316,
173
+ "epoch": 0.18133333333333335,
174
+ "grad_norm": 0.05834396556019783,
175
+ "learning_rate": 1.3592000000000001e-05,
176
+ "loss": 0.012847075462341309,
177
+ "mean_token_accuracy": 0.9960671140253544,
178
+ "num_tokens": 5880125.0,
179
+ "step": 1700
180
+ },
181
+ {
182
+ "entropy": 0.023504739217460154,
183
+ "epoch": 0.192,
184
+ "grad_norm": 0.09117468446493149,
185
+ "learning_rate": 1.4392e-05,
186
+ "loss": 0.011869451999664306,
187
+ "mean_token_accuracy": 0.9964955732226372,
188
+ "num_tokens": 6867360.0,
189
+ "step": 1800
190
+ },
191
+ {
192
+ "entropy": 0.022944011739455164,
193
+ "epoch": 0.20266666666666666,
194
+ "grad_norm": 0.046192608773708344,
195
+ "learning_rate": 1.4978666666666668e-05,
196
+ "loss": 0.010620262622833252,
197
+ "mean_token_accuracy": 0.9964672869443894,
198
+ "num_tokens": 7840472.0,
199
+ "step": 1900
200
+ },
201
+ {
202
+ "entropy": 0.018894561287015676,
203
+ "epoch": 0.21333333333333335,
204
+ "grad_norm": 0.02761668898165226,
205
+ "learning_rate": 1.4889777777777778e-05,
206
+ "loss": 0.0067046540975570675,
207
+ "mean_token_accuracy": 0.9975774252414703,
208
+ "num_tokens": 8827474.0,
209
+ "step": 2000
210
+ },
211
+ {
212
+ "entropy": 0.020934174589347095,
213
+ "epoch": 0.224,
214
+ "grad_norm": 0.02931295707821846,
215
+ "learning_rate": 1.4800888888888889e-05,
216
+ "loss": 0.010140993595123292,
217
+ "mean_token_accuracy": 0.996714953482151,
218
+ "num_tokens": 9808475.0,
219
+ "step": 2100
220
+ },
221
+ {
222
+ "entropy": 0.020719884738791734,
223
+ "epoch": 0.23466666666666666,
224
+ "grad_norm": 0.02652113325893879,
225
+ "learning_rate": 1.4712e-05,
226
+ "loss": 0.010557392835617066,
227
+ "mean_token_accuracy": 0.9969246552884579,
228
+ "num_tokens": 10782725.0,
229
+ "step": 2200
230
+ },
231
+ {
232
+ "entropy": 0.020559412932489068,
233
+ "epoch": 0.24533333333333332,
234
+ "grad_norm": 0.032906673848629,
235
+ "learning_rate": 1.4623111111111113e-05,
236
+ "loss": 0.009789772033691406,
237
+ "mean_token_accuracy": 0.9966975942254066,
238
+ "num_tokens": 11769893.0,
239
+ "step": 2300
240
+ },
241
+ {
242
+ "entropy": 0.018388252432923764,
243
+ "epoch": 0.256,
244
+ "grad_norm": 0.08162333816289902,
245
+ "learning_rate": 1.4534222222222222e-05,
246
+ "loss": 0.007909480929374695,
247
+ "mean_token_accuracy": 0.9972486282885075,
248
+ "num_tokens": 12753597.0,
249
+ "step": 2400
250
+ },
251
+ {
252
+ "entropy": 0.02204789153067395,
253
+ "epoch": 0.26666666666666666,
254
+ "grad_norm": 0.023511990904808044,
255
+ "learning_rate": 1.4445333333333334e-05,
256
+ "loss": 0.011945382356643677,
257
+ "mean_token_accuracy": 0.9963182592391968,
258
+ "num_tokens": 13739044.0,
259
+ "step": 2500
260
+ },
261
+ {
262
+ "entropy": 0.02037038065260276,
263
+ "epoch": 0.2773333333333333,
264
+ "grad_norm": 0.02014540508389473,
265
+ "learning_rate": 1.4356444444444446e-05,
266
+ "loss": 0.010165022611618042,
267
+ "mean_token_accuracy": 0.9967658732831478,
268
+ "num_tokens": 14709501.0,
269
+ "step": 2600
270
+ },
271
+ {
272
+ "entropy": 0.01947357293218374,
273
+ "epoch": 0.288,
274
+ "grad_norm": 0.01829116977751255,
275
+ "learning_rate": 1.4267555555555555e-05,
276
+ "loss": 0.009100326895713806,
277
+ "mean_token_accuracy": 0.9969173397123814,
278
+ "num_tokens": 15693309.0,
279
+ "step": 2700
280
+ },
281
+ {
282
+ "entropy": 0.021565243480727078,
283
+ "epoch": 0.2986666666666667,
284
+ "grad_norm": 0.031732361763715744,
285
+ "learning_rate": 1.4178666666666667e-05,
286
+ "loss": 0.01107092022895813,
287
+ "mean_token_accuracy": 0.9963353677093982,
288
+ "num_tokens": 985520.0,
289
+ "step": 2800
290
+ },
291
+ {
292
+ "entropy": 0.02136099517461844,
293
+ "epoch": 0.30933333333333335,
294
+ "grad_norm": 0.03870174661278725,
295
+ "learning_rate": 1.4089777777777779e-05,
296
+ "loss": 0.011569523811340332,
297
+ "mean_token_accuracy": 0.9964252272248268,
298
+ "num_tokens": 1969572.0,
299
+ "step": 2900
300
+ },
301
+ {
302
+ "entropy": 0.019074252294376492,
303
+ "epoch": 0.32,
304
+ "grad_norm": 0.01954864338040352,
305
+ "learning_rate": 1.4000888888888888e-05,
306
+ "loss": 0.008727578520774841,
307
+ "mean_token_accuracy": 0.9970971086621284,
308
+ "num_tokens": 2945404.0,
309
+ "step": 3000
310
+ },
311
+ {
312
+ "entropy": 0.018130726229865102,
313
+ "epoch": 0.33066666666666666,
314
+ "grad_norm": 0.01979902759194374,
315
+ "learning_rate": 1.3912e-05,
316
+ "loss": 0.007898266315460206,
317
+ "mean_token_accuracy": 0.9971335357427598,
318
+ "num_tokens": 3925122.0,
319
+ "step": 3100
320
+ },
321
+ {
322
+ "entropy": 0.019400757937546587,
323
+ "epoch": 0.3413333333333333,
324
+ "grad_norm": 0.030273959040641785,
325
+ "learning_rate": 1.0980206234443522e-05,
326
+ "loss": 0.009979066749413809,
327
+ "mean_token_accuracy": 0.9968534293584526,
328
+ "num_tokens": 945781.0,
329
+ "step": 3200
330
+ },
331
+ {
332
+ "entropy": 0.021496493350714446,
333
+ "epoch": 0.352,
334
+ "grad_norm": 0.023218955844640732,
335
+ "learning_rate": 1.0802417921061989e-05,
336
+ "loss": 0.011733214855194091,
337
+ "mean_token_accuracy": 0.996385814100504,
338
+ "num_tokens": 1919451.0,
339
+ "step": 3300
340
+ },
341
+ {
342
+ "entropy": 0.01932728004641831,
343
+ "epoch": 0.3626666666666667,
344
+ "grad_norm": 0.014179096557199955,
345
+ "learning_rate": 1.0624629607680455e-05,
346
+ "loss": 0.00976854920387268,
347
+ "mean_token_accuracy": 0.9968989025056362,
348
+ "num_tokens": 2892220.0,
349
+ "step": 3400
350
+ },
351
+ {
352
+ "entropy": 0.02088767145993188,
353
+ "epoch": 0.37333333333333335,
354
+ "grad_norm": 0.032297272235155106,
355
+ "learning_rate": 1.0446841294298921e-05,
356
+ "loss": 0.011026575565338134,
357
+ "mean_token_accuracy": 0.9964779444038868,
358
+ "num_tokens": 3878184.0,
359
+ "step": 3500
360
+ },
361
+ {
362
+ "entropy": 0.018377634914308463,
363
+ "epoch": 0.384,
364
+ "grad_norm": 0.012006225995719433,
365
+ "learning_rate": 1.3467555555555556e-05,
366
+ "loss": 0.008617221661235975,
367
+ "mean_token_accuracy": 0.9970365500320559,
368
+ "num_tokens": 904843.0,
369
+ "step": 3600
370
+ },
371
+ {
372
+ "entropy": 0.016050403744447977,
373
+ "epoch": 0.39466666666666667,
374
+ "grad_norm": 0.011128585785627365,
375
+ "learning_rate": 1.3378666666666666e-05,
376
+ "loss": 0.006250782608985901,
377
+ "mean_token_accuracy": 0.9976212471723557,
378
+ "num_tokens": 1878860.0,
379
+ "step": 3700
380
+ },
381
+ {
382
+ "entropy": 0.01809800002258271,
383
+ "epoch": 0.4053333333333333,
384
+ "grad_norm": 0.01952100545167923,
385
+ "learning_rate": 1.3289777777777778e-05,
386
+ "loss": 0.008064679503440857,
387
+ "mean_token_accuracy": 0.9971583542227745,
388
+ "num_tokens": 2860744.0,
389
+ "step": 3800
390
+ },
391
+ {
392
+ "entropy": 0.019823117861524225,
393
+ "epoch": 0.416,
394
+ "grad_norm": 0.011735321022570133,
395
+ "learning_rate": 1.3200888888888889e-05,
396
+ "loss": 0.010386246442794799,
397
+ "mean_token_accuracy": 0.9966941741108895,
398
+ "num_tokens": 3831565.0,
399
+ "step": 3900
400
+ },
401
+ {
402
+ "entropy": 0.02049923066298889,
403
+ "epoch": 0.4266666666666667,
404
+ "grad_norm": 0.006284466944634914,
405
+ "learning_rate": 1.3112e-05,
406
+ "loss": 0.01045264061107192,
407
+ "mean_token_accuracy": 0.9964593903616418,
408
+ "num_tokens": 849872.0,
409
+ "step": 4000
410
+ },
411
+ {
412
+ "entropy": 0.017120514765847476,
413
+ "epoch": 0.43733333333333335,
414
+ "grad_norm": 0.010025433264672756,
415
+ "learning_rate": 1.3023111111111111e-05,
416
+ "loss": 0.007535084486007691,
417
+ "mean_token_accuracy": 0.9973040929436684,
418
+ "num_tokens": 1826365.0,
419
+ "step": 4100
420
+ },
421
+ {
422
+ "entropy": 0.01937343619065359,
423
+ "epoch": 0.448,
424
+ "grad_norm": 0.008356385864317417,
425
+ "learning_rate": 1.2934222222222222e-05,
426
+ "loss": 0.010190980434417725,
427
+ "mean_token_accuracy": 0.9967715987563133,
428
+ "num_tokens": 2802749.0,
429
+ "step": 4200
430
+ },
431
+ {
432
+ "entropy": 0.017232202125014737,
433
+ "epoch": 0.45866666666666667,
434
+ "grad_norm": 0.011389357037842274,
435
+ "learning_rate": 1.2845333333333334e-05,
436
+ "loss": 0.007943087816238403,
437
+ "mean_token_accuracy": 0.9973043432831764,
438
+ "num_tokens": 3802758.0,
439
+ "step": 4300
440
+ },
441
+ {
442
+ "entropy": 0.022822092252748984,
443
+ "epoch": 0.4693333333333333,
444
+ "grad_norm": 0.017312563955783844,
445
+ "learning_rate": 1.2756444444444444e-05,
446
+ "loss": 0.012838510819423346,
447
+ "mean_token_accuracy": 0.9959132893953795,
448
+ "num_tokens": 789736.0,
449
+ "step": 4400
450
+ },
451
+ {
452
+ "entropy": 0.0184966369275935,
453
+ "epoch": 0.48,
454
+ "grad_norm": 0.04721185564994812,
455
+ "learning_rate": 1.2667555555555557e-05,
456
+ "loss": 0.009318522214889526,
457
+ "mean_token_accuracy": 0.9968563948571681,
458
+ "num_tokens": 1767600.0,
459
+ "step": 4500
460
+ },
461
+ {
462
+ "entropy": 0.021548866296652706,
463
+ "epoch": 0.49066666666666664,
464
+ "grad_norm": 0.9198243021965027,
465
+ "learning_rate": 1.2578666666666667e-05,
466
+ "loss": 0.013902791738510133,
467
+ "mean_token_accuracy": 0.9959381237626076,
468
+ "num_tokens": 2754982.0,
469
+ "step": 4600
470
+ },
471
+ {
472
+ "entropy": 0.0234364516264759,
473
+ "epoch": 0.5013333333333333,
474
+ "grad_norm": 0.011652004905045033,
475
+ "learning_rate": 1.2489777777777779e-05,
476
+ "loss": 0.011085785627365112,
477
+ "mean_token_accuracy": 0.9966410009562969,
478
+ "num_tokens": 3730109.0,
479
+ "step": 4700
480
+ },
481
+ {
482
+ "entropy": 0.017517962178529856,
483
+ "epoch": 0.512,
484
+ "grad_norm": 0.008591280318796635,
485
+ "learning_rate": 1.240088888888889e-05,
486
+ "loss": 0.0068675024168831965,
487
+ "mean_token_accuracy": 0.9973819294533173,
488
+ "num_tokens": 753786.0,
489
+ "step": 4800
490
+ },
491
+ {
492
+ "entropy": 0.019476991441333667,
493
+ "epoch": 0.5226666666666666,
494
+ "grad_norm": 0.013633953407406807,
495
+ "learning_rate": 1.2312e-05,
496
+ "loss": 0.009302983283996582,
497
+ "mean_token_accuracy": 0.9968732745945453,
498
+ "num_tokens": 1732738.0,
499
+ "step": 4900
500
+ },
501
+ {
502
+ "entropy": 0.020917424112558366,
503
+ "epoch": 0.5333333333333333,
504
+ "grad_norm": 0.01434489618986845,
505
+ "learning_rate": 1.2223111111111112e-05,
506
+ "loss": 0.010165597200393678,
507
+ "mean_token_accuracy": 0.9965643344819546,
508
+ "num_tokens": 2702069.0,
509
+ "step": 5000
510
+ },
511
+ {
512
+ "entropy": 0.017627096675569193,
513
+ "epoch": 0.544,
514
+ "grad_norm": 0.010028124786913395,
515
+ "learning_rate": 1.2134222222222223e-05,
516
+ "loss": 0.008581981062889099,
517
+ "mean_token_accuracy": 0.9972231885790825,
518
+ "num_tokens": 3692514.0,
519
+ "step": 5100
520
+ },
521
+ {
522
+ "entropy": 0.018209505494293832,
523
+ "epoch": 0.5546666666666666,
524
+ "grad_norm": 0.010386968962848186,
525
+ "learning_rate": 1.2045333333333333e-05,
526
+ "loss": 0.00814423689971099,
527
+ "mean_token_accuracy": 0.9972281313023051,
528
+ "num_tokens": 739184.0,
529
+ "step": 5200
530
+ },
531
+ {
532
+ "entropy": 0.02268622429575771,
533
+ "epoch": 0.5653333333333334,
534
+ "grad_norm": 0.01215888187289238,
535
+ "learning_rate": 1.1956444444444445e-05,
536
+ "loss": 0.012614715099334716,
537
+ "mean_token_accuracy": 0.9960222035646439,
538
+ "num_tokens": 1710404.0,
539
+ "step": 5300
540
+ },
541
+ {
542
+ "entropy": 0.020103816259652376,
543
+ "epoch": 0.576,
544
+ "grad_norm": 0.014174265787005424,
545
+ "learning_rate": 1.1867555555555556e-05,
546
+ "loss": 0.010200117826461791,
547
+ "mean_token_accuracy": 0.9967270520329475,
548
+ "num_tokens": 2686777.0,
549
+ "step": 5400
550
+ },
551
+ {
552
+ "entropy": 0.020467385197989643,
553
+ "epoch": 0.5866666666666667,
554
+ "grad_norm": 0.014257642440497875,
555
+ "learning_rate": 1.1778666666666666e-05,
556
+ "loss": 0.010185201168060303,
557
+ "mean_token_accuracy": 0.9966062535345555,
558
+ "num_tokens": 3655445.0,
559
+ "step": 5500
560
+ },
561
+ {
562
+ "entropy": 0.020591240752474878,
563
+ "epoch": 0.5973333333333334,
564
+ "grad_norm": 0.011690130457282066,
565
+ "learning_rate": 1.1689777777777778e-05,
566
+ "loss": 0.010783259419427402,
567
+ "mean_token_accuracy": 0.9964483192433482,
568
+ "num_tokens": 686164.0,
569
+ "step": 5600
570
+ },
571
+ {
572
+ "entropy": 0.0205141630244907,
573
+ "epoch": 0.608,
574
+ "grad_norm": 0.006469405256211758,
575
+ "learning_rate": 1.160088888888889e-05,
576
+ "loss": 0.010492314100265503,
577
+ "mean_token_accuracy": 0.9966046234965324,
578
+ "num_tokens": 1662167.0,
579
+ "step": 5700
580
+ },
581
+ {
582
+ "entropy": 0.020147288321750237,
583
+ "epoch": 0.6186666666666667,
584
+ "grad_norm": 0.01614871807396412,
585
+ "learning_rate": 1.1512e-05,
586
+ "loss": 0.010045292377471924,
587
+ "mean_token_accuracy": 0.9966985350847244,
588
+ "num_tokens": 2643221.0,
589
+ "step": 5800
590
+ },
591
+ {
592
+ "entropy": 0.01912423676229082,
593
+ "epoch": 0.6293333333333333,
594
+ "grad_norm": 0.010019957087934017,
595
+ "learning_rate": 1.1423111111111111e-05,
596
+ "loss": 0.009680591821670533,
597
+ "mean_token_accuracy": 0.9968334528803825,
598
+ "num_tokens": 3611899.0,
599
+ "step": 5900
600
+ },
601
+ {
602
+ "epoch": 0.16,
603
+ "grad_norm": 1.419013500213623,
604
+ "learning_rate": 1.2e-05,
605
+ "loss": 0.0443,
606
+ "step": 6000
607
+ },
608
+ {
609
+ "epoch": 0.16266666666666665,
610
+ "grad_norm": 0.0630747601389885,
611
+ "learning_rate": 1.22e-05,
612
+ "loss": 0.0066,
613
+ "step": 6100
614
+ },
615
+ {
616
+ "epoch": 0.16533333333333333,
617
+ "grad_norm": 0.08701734989881516,
618
+ "learning_rate": 1.24e-05,
619
+ "loss": 0.0171,
620
+ "step": 6200
621
+ },
622
+ {
623
+ "epoch": 0.168,
624
+ "grad_norm": 0.7274932861328125,
625
+ "learning_rate": 1.26e-05,
626
+ "loss": 0.0065,
627
+ "step": 6300
628
+ },
629
+ {
630
+ "epoch": 0.17066666666666666,
631
+ "grad_norm": 0.07388290017843246,
632
+ "learning_rate": 1.2800000000000001e-05,
633
+ "loss": 0.0087,
634
+ "step": 6400
635
+ },
636
+ {
637
+ "epoch": 0.17333333333333334,
638
+ "grad_norm": 0.045733992010354996,
639
+ "learning_rate": 1.3000000000000001e-05,
640
+ "loss": 0.0114,
641
+ "step": 6500
642
+ },
643
+ {
644
+ "epoch": 0.176,
645
+ "grad_norm": 0.4374091625213623,
646
+ "learning_rate": 1.32e-05,
647
+ "loss": 0.0139,
648
+ "step": 6600
649
+ },
650
+ {
651
+ "epoch": 0.17866666666666667,
652
+ "grad_norm": 0.050165899097919464,
653
+ "learning_rate": 1.34e-05,
654
+ "loss": 0.0145,
655
+ "step": 6700
656
+ },
657
+ {
658
+ "epoch": 0.18133333333333335,
659
+ "grad_norm": 0.047875139862298965,
660
+ "learning_rate": 1.36e-05,
661
+ "loss": 0.0101,
662
+ "step": 6800
663
+ },
664
+ {
665
+ "epoch": 0.184,
666
+ "grad_norm": 0.06959123909473419,
667
+ "learning_rate": 1.3800000000000002e-05,
668
+ "loss": 0.0126,
669
+ "step": 6900
670
+ },
671
+ {
672
+ "epoch": 0.18666666666666668,
673
+ "grad_norm": 0.0340999960899353,
674
+ "learning_rate": 1.4e-05,
675
+ "loss": 0.0084,
676
+ "step": 7000
677
+ },
678
+ {
679
+ "epoch": 0.18933333333333333,
680
+ "grad_norm": 0.049361471086740494,
681
+ "learning_rate": 1.42e-05,
682
+ "loss": 0.0123,
683
+ "step": 7100
684
+ },
685
+ {
686
+ "epoch": 0.192,
687
+ "grad_norm": 0.11002720147371292,
688
+ "learning_rate": 1.44e-05,
689
+ "loss": 0.0111,
690
+ "step": 7200
691
+ },
692
+ {
693
+ "epoch": 0.19466666666666665,
694
+ "grad_norm": 0.04423600062727928,
695
+ "learning_rate": 1.46e-05,
696
+ "loss": 0.0082,
697
+ "step": 7300
698
+ },
699
+ {
700
+ "epoch": 0.19733333333333333,
701
+ "grad_norm": 0.052411410957574844,
702
+ "learning_rate": 1.48e-05,
703
+ "loss": 0.0042,
704
+ "step": 7400
705
+ },
706
+ {
707
+ "epoch": 0.2,
708
+ "grad_norm": 0.045492108911275864,
709
+ "learning_rate": 1.5e-05,
710
+ "loss": 0.02,
711
+ "step": 7500
712
+ },
713
+ {
714
+ "epoch": 0.20266666666666666,
715
+ "grad_norm": 0.07257890701293945,
716
+ "learning_rate": 1.4977777777777778e-05,
717
+ "loss": 0.0109,
718
+ "step": 7600
719
+ },
720
+ {
721
+ "epoch": 0.20533333333333334,
722
+ "grad_norm": 0.0558447502553463,
723
+ "learning_rate": 1.4955555555555556e-05,
724
+ "loss": 0.0065,
725
+ "step": 7700
726
+ },
727
+ {
728
+ "epoch": 0.208,
729
+ "grad_norm": 0.0893528088927269,
730
+ "learning_rate": 1.4933333333333333e-05,
731
+ "loss": 0.0067,
732
+ "step": 7800
733
+ },
734
+ {
735
+ "epoch": 0.21066666666666667,
736
+ "grad_norm": 0.033552590757608414,
737
+ "learning_rate": 1.4911111111111113e-05,
738
+ "loss": 0.0056,
739
+ "step": 7900
740
+ },
741
+ {
742
+ "epoch": 0.21333333333333335,
743
+ "grad_norm": 0.05111644044518471,
744
+ "learning_rate": 1.4888888888888888e-05,
745
+ "loss": 0.0061,
746
+ "step": 8000
747
+ },
748
+ {
749
+ "epoch": 0.216,
750
+ "grad_norm": 0.07947065681219101,
751
+ "learning_rate": 1.4866666666666668e-05,
752
+ "loss": 0.0078,
753
+ "step": 8100
754
+ },
755
+ {
756
+ "epoch": 0.21866666666666668,
757
+ "grad_norm": 0.04130572825670242,
758
+ "learning_rate": 1.4844444444444445e-05,
759
+ "loss": 0.0113,
760
+ "step": 8200
761
+ },
762
+ {
763
+ "epoch": 0.22133333333333333,
764
+ "grad_norm": 0.07327134907245636,
765
+ "learning_rate": 1.4822222222222221e-05,
766
+ "loss": 0.0128,
767
+ "step": 8300
768
+ },
769
+ {
770
+ "epoch": 0.224,
771
+ "grad_norm": 0.03684012219309807,
772
+ "learning_rate": 1.48e-05,
773
+ "loss": 0.0053,
774
+ "step": 8400
775
+ },
776
+ {
777
+ "epoch": 0.22666666666666666,
778
+ "grad_norm": 0.03126613050699234,
779
+ "learning_rate": 1.4777777777777778e-05,
780
+ "loss": 0.0079,
781
+ "step": 8500
782
+ },
783
+ {
784
+ "epoch": 0.22933333333333333,
785
+ "grad_norm": 0.06458591669797897,
786
+ "learning_rate": 1.4755555555555556e-05,
787
+ "loss": 0.016,
788
+ "step": 8600
789
+ },
790
+ {
791
+ "epoch": 0.232,
792
+ "grad_norm": 0.03598800674080849,
793
+ "learning_rate": 1.4733333333333333e-05,
794
+ "loss": 0.0043,
795
+ "step": 8700
796
+ },
797
+ {
798
+ "epoch": 0.23466666666666666,
799
+ "grad_norm": 0.032879263162612915,
800
+ "learning_rate": 1.4711111111111111e-05,
801
+ "loss": 0.01,
802
+ "step": 8800
803
+ },
804
+ {
805
+ "epoch": 0.23733333333333334,
806
+ "grad_norm": 0.04179929941892624,
807
+ "learning_rate": 1.4688888888888889e-05,
808
+ "loss": 0.0088,
809
+ "step": 8900
810
+ },
811
+ {
812
+ "epoch": 0.24,
813
+ "grad_norm": 0.025431055575609207,
814
+ "learning_rate": 1.4666666666666666e-05,
815
+ "loss": 0.0103,
816
+ "step": 9000
817
+ },
818
+ {
819
+ "epoch": 0.24266666666666667,
820
+ "grad_norm": 0.2131260633468628,
821
+ "learning_rate": 1.4644444444444446e-05,
822
+ "loss": 0.0144,
823
+ "step": 9100
824
+ },
825
+ {
826
+ "epoch": 0.24533333333333332,
827
+ "grad_norm": 0.03861390799283981,
828
+ "learning_rate": 1.4622222222222223e-05,
829
+ "loss": 0.0088,
830
+ "step": 9200
831
+ },
832
+ {
833
+ "epoch": 0.248,
834
+ "grad_norm": 0.026965836063027382,
835
+ "learning_rate": 1.46e-05,
836
+ "loss": 0.0102,
837
+ "step": 9300
838
+ },
839
+ {
840
+ "epoch": 0.25066666666666665,
841
+ "grad_norm": 0.02597722038626671,
842
+ "learning_rate": 1.4577777777777778e-05,
843
+ "loss": 0.0053,
844
+ "step": 9400
845
+ },
846
+ {
847
+ "epoch": 0.25333333333333335,
848
+ "grad_norm": 0.03126470744609833,
849
+ "learning_rate": 1.4555555555555556e-05,
850
+ "loss": 0.0077,
851
+ "step": 9500
852
+ },
853
+ {
854
+ "epoch": 0.256,
855
+ "grad_norm": 0.6931378841400146,
856
+ "learning_rate": 1.4533333333333334e-05,
857
+ "loss": 0.0077,
858
+ "step": 9600
859
+ },
860
+ {
861
+ "epoch": 0.25866666666666666,
862
+ "grad_norm": 0.026384815573692322,
863
+ "learning_rate": 1.4511111111111111e-05,
864
+ "loss": 0.0091,
865
+ "step": 9700
866
+ },
867
+ {
868
+ "epoch": 0.2613333333333333,
869
+ "grad_norm": 0.030717667192220688,
870
+ "learning_rate": 1.448888888888889e-05,
871
+ "loss": 0.0147,
872
+ "step": 9800
873
+ },
874
+ {
875
+ "epoch": 0.264,
876
+ "grad_norm": 0.02856365405023098,
877
+ "learning_rate": 1.4466666666666667e-05,
878
+ "loss": 0.0146,
879
+ "step": 9900
880
+ },
881
+ {
882
+ "epoch": 0.26666666666666666,
883
+ "grad_norm": 0.07391875237226486,
884
+ "learning_rate": 1.4444444444444444e-05,
885
+ "loss": 0.0072,
886
+ "step": 10000
887
+ },
888
+ {
889
+ "epoch": 0.2693333333333333,
890
+ "grad_norm": 0.026893191039562225,
891
+ "learning_rate": 1.4422222222222223e-05,
892
+ "loss": 0.0071,
893
+ "step": 10100
894
+ },
895
+ {
896
+ "epoch": 0.272,
897
+ "grad_norm": 0.04391603171825409,
898
+ "learning_rate": 1.44e-05,
899
+ "loss": 0.0124,
900
+ "step": 10200
901
+ },
902
+ {
903
+ "epoch": 0.27466666666666667,
904
+ "grad_norm": 0.03955280780792236,
905
+ "learning_rate": 1.4377777777777779e-05,
906
+ "loss": 0.0099,
907
+ "step": 10300
908
+ },
909
+ {
910
+ "epoch": 0.2773333333333333,
911
+ "grad_norm": 0.031186288222670555,
912
+ "learning_rate": 1.4355555555555556e-05,
913
+ "loss": 0.0099,
914
+ "step": 10400
915
+ },
916
+ {
917
+ "epoch": 0.28,
918
+ "grad_norm": 0.04440930485725403,
919
+ "learning_rate": 1.4333333333333334e-05,
920
+ "loss": 0.0042,
921
+ "step": 10500
922
+ },
923
+ {
924
+ "epoch": 0.2826666666666667,
925
+ "grad_norm": 0.035457074642181396,
926
+ "learning_rate": 1.4311111111111111e-05,
927
+ "loss": 0.0179,
928
+ "step": 10600
929
+ },
930
+ {
931
+ "epoch": 0.2853333333333333,
932
+ "grad_norm": 0.042004115879535675,
933
+ "learning_rate": 1.4288888888888889e-05,
934
+ "loss": 0.007,
935
+ "step": 10700
936
+ },
937
+ {
938
+ "epoch": 0.288,
939
+ "grad_norm": 0.0251617394387722,
940
+ "learning_rate": 1.4266666666666667e-05,
941
+ "loss": 0.0071,
942
+ "step": 10800
943
+ },
944
+ {
945
+ "epoch": 0.2906666666666667,
946
+ "grad_norm": 0.027697479352355003,
947
+ "learning_rate": 1.4244444444444444e-05,
948
+ "loss": 0.0046,
949
+ "step": 10900
950
+ },
951
+ {
952
+ "epoch": 0.29333333333333333,
953
+ "grad_norm": 0.035603832453489304,
954
+ "learning_rate": 1.4222222222222224e-05,
955
+ "loss": 0.0114,
956
+ "step": 11000
957
+ },
958
+ {
959
+ "epoch": 0.296,
960
+ "grad_norm": 0.2773999571800232,
961
+ "learning_rate": 1.42e-05,
962
+ "loss": 0.0195,
963
+ "step": 11100
964
+ },
965
+ {
966
+ "epoch": 0.2986666666666667,
967
+ "grad_norm": 0.041750043630599976,
968
+ "learning_rate": 1.4177777777777779e-05,
969
+ "loss": 0.0095,
970
+ "step": 11200
971
+ }
972
+ ],
973
+ "logging_steps": 100,
974
+ "max_steps": 75000,
975
+ "num_input_tokens_seen": 0,
976
+ "num_train_epochs": 2,
977
+ "save_steps": 100,
978
+ "stateful_callbacks": {
979
+ "TrainerControl": {
980
+ "args": {
981
+ "should_epoch_stop": false,
982
+ "should_evaluate": false,
983
+ "should_log": false,
984
+ "should_save": true,
985
+ "should_training_stop": false
986
+ },
987
+ "attributes": {}
988
+ }
989
+ },
990
+ "total_flos": 4.179260872801911e+18,
991
+ "train_batch_size": 4,
992
+ "trial_name": null,
993
+ "trial_params": null
994
+ }
checkpoint-11200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40a4c02bff4c325a39b29bbad54a640a82b1bc361c29cc2656ac8d29cf43eaa
3
+ size 5432
checkpoint-11300/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Dorn4449/CyberSentinel-Mistral-7B-v3.8
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
checkpoint-11300/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-11300/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9baa981864ae24733639a26defb5968da6c146de99de09cc5be4109925e2082a
3
+ size 27297032
checkpoint-11300/checkpoint-6000/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-11300/checkpoint-6000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:286625e19ea92f6bcbc715bf5d40bac2ac85da8b9ecb31679d50c38cb4b4b694
3
+ size 27297032
checkpoint-11300/checkpoint-6100/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-11300/checkpoint-6100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0a6ee2942c85113263a2331006fa30db088217df193b0fca3de2eebe282399b
3
+ size 27297032
checkpoint-11300/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35ea07d2004323e7bf5cee7511ae01761675d74e6ef563c4c56427b392f1b7e1
3
+ size 54744314
checkpoint-11300/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c93ff98a18552cb3fdcb913aaf4c75715777c8558d6294dc497b0ac307734565
3
+ size 14244
checkpoint-11300/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4abbf9c1ef6a4a21c65dfe078d0ed3395700462d8d0a37ffa60e810d3db7b51
3
+ size 1064
checkpoint-11300/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-11300/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-11300/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n",
34
+ "clean_up_tokenization_spaces": false,
35
+ "eos_token": "</s>",
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
checkpoint-11300/trainer_state.json ADDED
@@ -0,0 +1,1001 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.30133333333333334,
5
+ "eval_steps": 500,
6
+ "global_step": 11300,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "entropy": 1.084044404476881,
13
+ "epoch": 0.010666666666666666,
14
+ "grad_norm": 0.796875,
15
+ "learning_rate": 7.92e-07,
16
+ "loss": 1.3782466125488282,
17
+ "mean_token_accuracy": 0.7083857330679894,
18
+ "num_tokens": 986491.0,
19
+ "step": 100
20
+ },
21
+ {
22
+ "entropy": 1.1007767178118228,
23
+ "epoch": 0.021333333333333333,
24
+ "grad_norm": 0.3125,
25
+ "learning_rate": 1.592e-06,
26
+ "loss": 1.2902149963378906,
27
+ "mean_token_accuracy": 0.7171274860203266,
28
+ "num_tokens": 1952567.0,
29
+ "step": 200
30
+ },
31
+ {
32
+ "entropy": 1.0802835547924041,
33
+ "epoch": 0.032,
34
+ "grad_norm": 0.28515625,
35
+ "learning_rate": 2.392e-06,
36
+ "loss": 1.170371322631836,
37
+ "mean_token_accuracy": 0.7354862231016159,
38
+ "num_tokens": 2935479.0,
39
+ "step": 300
40
+ },
41
+ {
42
+ "entropy": 1.0573877203464508,
43
+ "epoch": 0.042666666666666665,
44
+ "grad_norm": 0.23046875,
45
+ "learning_rate": 3.192e-06,
46
+ "loss": 1.062843780517578,
47
+ "mean_token_accuracy": 0.7536254300177098,
48
+ "num_tokens": 3920344.0,
49
+ "step": 400
50
+ },
51
+ {
52
+ "entropy": 1.0149462178349495,
53
+ "epoch": 0.05333333333333334,
54
+ "grad_norm": 0.212890625,
55
+ "learning_rate": 3.992e-06,
56
+ "loss": 0.9590372467041015,
57
+ "mean_token_accuracy": 0.7738636130094528,
58
+ "num_tokens": 4899217.0,
59
+ "step": 500
60
+ },
61
+ {
62
+ "entropy": 0.8931056271493435,
63
+ "epoch": 0.064,
64
+ "grad_norm": 0.28125,
65
+ "learning_rate": 4.792e-06,
66
+ "loss": 0.7998863983154297,
67
+ "mean_token_accuracy": 0.8012332341074944,
68
+ "num_tokens": 5883391.0,
69
+ "step": 600
70
+ },
71
+ {
72
+ "entropy": 0.6136953190714121,
73
+ "epoch": 0.07466666666666667,
74
+ "grad_norm": 1.1015625,
75
+ "learning_rate": 5.592000000000001e-06,
76
+ "loss": 0.5131131362915039,
77
+ "mean_token_accuracy": 0.866335421949625,
78
+ "num_tokens": 6857125.0,
79
+ "step": 700
80
+ },
81
+ {
82
+ "entropy": 0.2726459547691047,
83
+ "epoch": 0.08533333333333333,
84
+ "grad_norm": 0.474609375,
85
+ "learning_rate": 6.392e-06,
86
+ "loss": 0.18439870834350586,
87
+ "mean_token_accuracy": 0.9579810079932213,
88
+ "num_tokens": 7831290.0,
89
+ "step": 800
90
+ },
91
+ {
92
+ "entropy": 0.11284050881862641,
93
+ "epoch": 0.096,
94
+ "grad_norm": 0.1875,
95
+ "learning_rate": 7.192e-06,
96
+ "loss": 0.05271556854248047,
97
+ "mean_token_accuracy": 0.9904264670610428,
98
+ "num_tokens": 8815417.0,
99
+ "step": 900
100
+ },
101
+ {
102
+ "entropy": 0.068436967888847,
103
+ "epoch": 0.10666666666666667,
104
+ "grad_norm": 0.125,
105
+ "learning_rate": 7.992e-06,
106
+ "loss": 0.02549468755722046,
107
+ "mean_token_accuracy": 0.9951217715442181,
108
+ "num_tokens": 9782033.0,
109
+ "step": 1000
110
+ },
111
+ {
112
+ "entropy": 0.052372096767649055,
113
+ "epoch": 0.11733333333333333,
114
+ "grad_norm": 0.1318359375,
115
+ "learning_rate": 8.792e-06,
116
+ "loss": 0.017259199619293213,
117
+ "mean_token_accuracy": 0.9962555834650993,
118
+ "num_tokens": 10759962.0,
119
+ "step": 1100
120
+ },
121
+ {
122
+ "entropy": 0.03862797610927373,
123
+ "epoch": 0.128,
124
+ "grad_norm": 0.0867946669459343,
125
+ "learning_rate": 9.591999999999999e-06,
126
+ "loss": 0.013324768543243408,
127
+ "mean_token_accuracy": 0.9968111206591129,
128
+ "num_tokens": 987986.0,
129
+ "step": 1200
130
+ },
131
+ {
132
+ "entropy": 0.032093781144358215,
133
+ "epoch": 0.13866666666666666,
134
+ "grad_norm": 0.037592533975839615,
135
+ "learning_rate": 1.0392e-05,
136
+ "loss": 0.013392001390457153,
137
+ "mean_token_accuracy": 0.9965760576725006,
138
+ "num_tokens": 1968127.0,
139
+ "step": 1300
140
+ },
141
+ {
142
+ "entropy": 0.031904329673852774,
143
+ "epoch": 0.14933333333333335,
144
+ "grad_norm": 0.10515860468149185,
145
+ "learning_rate": 1.1192e-05,
146
+ "loss": 0.016373103857040404,
147
+ "mean_token_accuracy": 0.9955054900050163,
148
+ "num_tokens": 2965677.0,
149
+ "step": 1400
150
+ },
151
+ {
152
+ "entropy": 0.0265167937008664,
153
+ "epoch": 0.16,
154
+ "grad_norm": 0.12689532339572906,
155
+ "learning_rate": 1.1992e-05,
156
+ "loss": 0.011554093360900878,
157
+ "mean_token_accuracy": 0.996508517563343,
158
+ "num_tokens": 3938144.0,
159
+ "step": 1500
160
+ },
161
+ {
162
+ "entropy": 0.024146563813555986,
163
+ "epoch": 0.17066666666666666,
164
+ "grad_norm": 0.03401608020067215,
165
+ "learning_rate": 1.2792e-05,
166
+ "loss": 0.010398292541503906,
167
+ "mean_token_accuracy": 0.9966818282008171,
168
+ "num_tokens": 4897019.0,
169
+ "step": 1600
170
+ },
171
+ {
172
+ "entropy": 0.025755486716516316,
173
+ "epoch": 0.18133333333333335,
174
+ "grad_norm": 0.05834396556019783,
175
+ "learning_rate": 1.3592000000000001e-05,
176
+ "loss": 0.012847075462341309,
177
+ "mean_token_accuracy": 0.9960671140253544,
178
+ "num_tokens": 5880125.0,
179
+ "step": 1700
180
+ },
181
+ {
182
+ "entropy": 0.023504739217460154,
183
+ "epoch": 0.192,
184
+ "grad_norm": 0.09117468446493149,
185
+ "learning_rate": 1.4392e-05,
186
+ "loss": 0.011869451999664306,
187
+ "mean_token_accuracy": 0.9964955732226372,
188
+ "num_tokens": 6867360.0,
189
+ "step": 1800
190
+ },
191
+ {
192
+ "entropy": 0.022944011739455164,
193
+ "epoch": 0.20266666666666666,
194
+ "grad_norm": 0.046192608773708344,
195
+ "learning_rate": 1.4978666666666668e-05,
196
+ "loss": 0.010620262622833252,
197
+ "mean_token_accuracy": 0.9964672869443894,
198
+ "num_tokens": 7840472.0,
199
+ "step": 1900
200
+ },
201
+ {
202
+ "entropy": 0.018894561287015676,
203
+ "epoch": 0.21333333333333335,
204
+ "grad_norm": 0.02761668898165226,
205
+ "learning_rate": 1.4889777777777778e-05,
206
+ "loss": 0.0067046540975570675,
207
+ "mean_token_accuracy": 0.9975774252414703,
208
+ "num_tokens": 8827474.0,
209
+ "step": 2000
210
+ },
211
+ {
212
+ "entropy": 0.020934174589347095,
213
+ "epoch": 0.224,
214
+ "grad_norm": 0.02931295707821846,
215
+ "learning_rate": 1.4800888888888889e-05,
216
+ "loss": 0.010140993595123292,
217
+ "mean_token_accuracy": 0.996714953482151,
218
+ "num_tokens": 9808475.0,
219
+ "step": 2100
220
+ },
221
+ {
222
+ "entropy": 0.020719884738791734,
223
+ "epoch": 0.23466666666666666,
224
+ "grad_norm": 0.02652113325893879,
225
+ "learning_rate": 1.4712e-05,
226
+ "loss": 0.010557392835617066,
227
+ "mean_token_accuracy": 0.9969246552884579,
228
+ "num_tokens": 10782725.0,
229
+ "step": 2200
230
+ },
231
+ {
232
+ "entropy": 0.020559412932489068,
233
+ "epoch": 0.24533333333333332,
234
+ "grad_norm": 0.032906673848629,
235
+ "learning_rate": 1.4623111111111113e-05,
236
+ "loss": 0.009789772033691406,
237
+ "mean_token_accuracy": 0.9966975942254066,
238
+ "num_tokens": 11769893.0,
239
+ "step": 2300
240
+ },
241
+ {
242
+ "entropy": 0.018388252432923764,
243
+ "epoch": 0.256,
244
+ "grad_norm": 0.08162333816289902,
245
+ "learning_rate": 1.4534222222222222e-05,
246
+ "loss": 0.007909480929374695,
247
+ "mean_token_accuracy": 0.9972486282885075,
248
+ "num_tokens": 12753597.0,
249
+ "step": 2400
250
+ },
251
+ {
252
+ "entropy": 0.02204789153067395,
253
+ "epoch": 0.26666666666666666,
254
+ "grad_norm": 0.023511990904808044,
255
+ "learning_rate": 1.4445333333333334e-05,
256
+ "loss": 0.011945382356643677,
257
+ "mean_token_accuracy": 0.9963182592391968,
258
+ "num_tokens": 13739044.0,
259
+ "step": 2500
260
+ },
261
+ {
262
+ "entropy": 0.02037038065260276,
263
+ "epoch": 0.2773333333333333,
264
+ "grad_norm": 0.02014540508389473,
265
+ "learning_rate": 1.4356444444444446e-05,
266
+ "loss": 0.010165022611618042,
267
+ "mean_token_accuracy": 0.9967658732831478,
268
+ "num_tokens": 14709501.0,
269
+ "step": 2600
270
+ },
271
+ {
272
+ "entropy": 0.01947357293218374,
273
+ "epoch": 0.288,
274
+ "grad_norm": 0.01829116977751255,
275
+ "learning_rate": 1.4267555555555555e-05,
276
+ "loss": 0.009100326895713806,
277
+ "mean_token_accuracy": 0.9969173397123814,
278
+ "num_tokens": 15693309.0,
279
+ "step": 2700
280
+ },
281
+ {
282
+ "entropy": 0.021565243480727078,
283
+ "epoch": 0.2986666666666667,
284
+ "grad_norm": 0.031732361763715744,
285
+ "learning_rate": 1.4178666666666667e-05,
286
+ "loss": 0.01107092022895813,
287
+ "mean_token_accuracy": 0.9963353677093982,
288
+ "num_tokens": 985520.0,
289
+ "step": 2800
290
+ },
291
+ {
292
+ "entropy": 0.02136099517461844,
293
+ "epoch": 0.30933333333333335,
294
+ "grad_norm": 0.03870174661278725,
295
+ "learning_rate": 1.4089777777777779e-05,
296
+ "loss": 0.011569523811340332,
297
+ "mean_token_accuracy": 0.9964252272248268,
298
+ "num_tokens": 1969572.0,
299
+ "step": 2900
300
+ },
301
+ {
302
+ "entropy": 0.019074252294376492,
303
+ "epoch": 0.32,
304
+ "grad_norm": 0.01954864338040352,
305
+ "learning_rate": 1.4000888888888888e-05,
306
+ "loss": 0.008727578520774841,
307
+ "mean_token_accuracy": 0.9970971086621284,
308
+ "num_tokens": 2945404.0,
309
+ "step": 3000
310
+ },
311
+ {
312
+ "entropy": 0.018130726229865102,
313
+ "epoch": 0.33066666666666666,
314
+ "grad_norm": 0.01979902759194374,
315
+ "learning_rate": 1.3912e-05,
316
+ "loss": 0.007898266315460206,
317
+ "mean_token_accuracy": 0.9971335357427598,
318
+ "num_tokens": 3925122.0,
319
+ "step": 3100
320
+ },
321
+ {
322
+ "entropy": 0.019400757937546587,
323
+ "epoch": 0.3413333333333333,
324
+ "grad_norm": 0.030273959040641785,
325
+ "learning_rate": 1.0980206234443522e-05,
326
+ "loss": 0.009979066749413809,
327
+ "mean_token_accuracy": 0.9968534293584526,
328
+ "num_tokens": 945781.0,
329
+ "step": 3200
330
+ },
331
+ {
332
+ "entropy": 0.021496493350714446,
333
+ "epoch": 0.352,
334
+ "grad_norm": 0.023218955844640732,
335
+ "learning_rate": 1.0802417921061989e-05,
336
+ "loss": 0.011733214855194091,
337
+ "mean_token_accuracy": 0.996385814100504,
338
+ "num_tokens": 1919451.0,
339
+ "step": 3300
340
+ },
341
+ {
342
+ "entropy": 0.01932728004641831,
343
+ "epoch": 0.3626666666666667,
344
+ "grad_norm": 0.014179096557199955,
345
+ "learning_rate": 1.0624629607680455e-05,
346
+ "loss": 0.00976854920387268,
347
+ "mean_token_accuracy": 0.9968989025056362,
348
+ "num_tokens": 2892220.0,
349
+ "step": 3400
350
+ },
351
+ {
352
+ "entropy": 0.02088767145993188,
353
+ "epoch": 0.37333333333333335,
354
+ "grad_norm": 0.032297272235155106,
355
+ "learning_rate": 1.0446841294298921e-05,
356
+ "loss": 0.011026575565338134,
357
+ "mean_token_accuracy": 0.9964779444038868,
358
+ "num_tokens": 3878184.0,
359
+ "step": 3500
360
+ },
361
+ {
362
+ "entropy": 0.018377634914308463,
363
+ "epoch": 0.384,
364
+ "grad_norm": 0.012006225995719433,
365
+ "learning_rate": 1.3467555555555556e-05,
366
+ "loss": 0.008617221661235975,
367
+ "mean_token_accuracy": 0.9970365500320559,
368
+ "num_tokens": 904843.0,
369
+ "step": 3600
370
+ },
371
+ {
372
+ "entropy": 0.016050403744447977,
373
+ "epoch": 0.39466666666666667,
374
+ "grad_norm": 0.011128585785627365,
375
+ "learning_rate": 1.3378666666666666e-05,
376
+ "loss": 0.006250782608985901,
377
+ "mean_token_accuracy": 0.9976212471723557,
378
+ "num_tokens": 1878860.0,
379
+ "step": 3700
380
+ },
381
+ {
382
+ "entropy": 0.01809800002258271,
383
+ "epoch": 0.4053333333333333,
384
+ "grad_norm": 0.01952100545167923,
385
+ "learning_rate": 1.3289777777777778e-05,
386
+ "loss": 0.008064679503440857,
387
+ "mean_token_accuracy": 0.9971583542227745,
388
+ "num_tokens": 2860744.0,
389
+ "step": 3800
390
+ },
391
+ {
392
+ "entropy": 0.019823117861524225,
393
+ "epoch": 0.416,
394
+ "grad_norm": 0.011735321022570133,
395
+ "learning_rate": 1.3200888888888889e-05,
396
+ "loss": 0.010386246442794799,
397
+ "mean_token_accuracy": 0.9966941741108895,
398
+ "num_tokens": 3831565.0,
399
+ "step": 3900
400
+ },
401
+ {
402
+ "entropy": 0.02049923066298889,
403
+ "epoch": 0.4266666666666667,
404
+ "grad_norm": 0.006284466944634914,
405
+ "learning_rate": 1.3112e-05,
406
+ "loss": 0.01045264061107192,
407
+ "mean_token_accuracy": 0.9964593903616418,
408
+ "num_tokens": 849872.0,
409
+ "step": 4000
410
+ },
411
+ {
412
+ "entropy": 0.017120514765847476,
413
+ "epoch": 0.43733333333333335,
414
+ "grad_norm": 0.010025433264672756,
415
+ "learning_rate": 1.3023111111111111e-05,
416
+ "loss": 0.007535084486007691,
417
+ "mean_token_accuracy": 0.9973040929436684,
418
+ "num_tokens": 1826365.0,
419
+ "step": 4100
420
+ },
421
+ {
422
+ "entropy": 0.01937343619065359,
423
+ "epoch": 0.448,
424
+ "grad_norm": 0.008356385864317417,
425
+ "learning_rate": 1.2934222222222222e-05,
426
+ "loss": 0.010190980434417725,
427
+ "mean_token_accuracy": 0.9967715987563133,
428
+ "num_tokens": 2802749.0,
429
+ "step": 4200
430
+ },
431
+ {
432
+ "entropy": 0.017232202125014737,
433
+ "epoch": 0.45866666666666667,
434
+ "grad_norm": 0.011389357037842274,
435
+ "learning_rate": 1.2845333333333334e-05,
436
+ "loss": 0.007943087816238403,
437
+ "mean_token_accuracy": 0.9973043432831764,
438
+ "num_tokens": 3802758.0,
439
+ "step": 4300
440
+ },
441
+ {
442
+ "entropy": 0.022822092252748984,
443
+ "epoch": 0.4693333333333333,
444
+ "grad_norm": 0.017312563955783844,
445
+ "learning_rate": 1.2756444444444444e-05,
446
+ "loss": 0.012838510819423346,
447
+ "mean_token_accuracy": 0.9959132893953795,
448
+ "num_tokens": 789736.0,
449
+ "step": 4400
450
+ },
451
+ {
452
+ "entropy": 0.0184966369275935,
453
+ "epoch": 0.48,
454
+ "grad_norm": 0.04721185564994812,
455
+ "learning_rate": 1.2667555555555557e-05,
456
+ "loss": 0.009318522214889526,
457
+ "mean_token_accuracy": 0.9968563948571681,
458
+ "num_tokens": 1767600.0,
459
+ "step": 4500
460
+ },
461
+ {
462
+ "entropy": 0.021548866296652706,
463
+ "epoch": 0.49066666666666664,
464
+ "grad_norm": 0.9198243021965027,
465
+ "learning_rate": 1.2578666666666667e-05,
466
+ "loss": 0.013902791738510133,
467
+ "mean_token_accuracy": 0.9959381237626076,
468
+ "num_tokens": 2754982.0,
469
+ "step": 4600
470
+ },
471
+ {
472
+ "entropy": 0.0234364516264759,
473
+ "epoch": 0.5013333333333333,
474
+ "grad_norm": 0.011652004905045033,
475
+ "learning_rate": 1.2489777777777779e-05,
476
+ "loss": 0.011085785627365112,
477
+ "mean_token_accuracy": 0.9966410009562969,
478
+ "num_tokens": 3730109.0,
479
+ "step": 4700
480
+ },
481
+ {
482
+ "entropy": 0.017517962178529856,
483
+ "epoch": 0.512,
484
+ "grad_norm": 0.008591280318796635,
485
+ "learning_rate": 1.240088888888889e-05,
486
+ "loss": 0.0068675024168831965,
487
+ "mean_token_accuracy": 0.9973819294533173,
488
+ "num_tokens": 753786.0,
489
+ "step": 4800
490
+ },
491
+ {
492
+ "entropy": 0.019476991441333667,
493
+ "epoch": 0.5226666666666666,
494
+ "grad_norm": 0.013633953407406807,
495
+ "learning_rate": 1.2312e-05,
496
+ "loss": 0.009302983283996582,
497
+ "mean_token_accuracy": 0.9968732745945453,
498
+ "num_tokens": 1732738.0,
499
+ "step": 4900
500
+ },
501
+ {
502
+ "entropy": 0.020917424112558366,
503
+ "epoch": 0.5333333333333333,
504
+ "grad_norm": 0.01434489618986845,
505
+ "learning_rate": 1.2223111111111112e-05,
506
+ "loss": 0.010165597200393678,
507
+ "mean_token_accuracy": 0.9965643344819546,
508
+ "num_tokens": 2702069.0,
509
+ "step": 5000
510
+ },
511
+ {
512
+ "entropy": 0.017627096675569193,
513
+ "epoch": 0.544,
514
+ "grad_norm": 0.010028124786913395,
515
+ "learning_rate": 1.2134222222222223e-05,
516
+ "loss": 0.008581981062889099,
517
+ "mean_token_accuracy": 0.9972231885790825,
518
+ "num_tokens": 3692514.0,
519
+ "step": 5100
520
+ },
521
+ {
522
+ "entropy": 0.018209505494293832,
523
+ "epoch": 0.5546666666666666,
524
+ "grad_norm": 0.010386968962848186,
525
+ "learning_rate": 1.2045333333333333e-05,
526
+ "loss": 0.00814423689971099,
527
+ "mean_token_accuracy": 0.9972281313023051,
528
+ "num_tokens": 739184.0,
529
+ "step": 5200
530
+ },
531
+ {
532
+ "entropy": 0.02268622429575771,
533
+ "epoch": 0.5653333333333334,
534
+ "grad_norm": 0.01215888187289238,
535
+ "learning_rate": 1.1956444444444445e-05,
536
+ "loss": 0.012614715099334716,
537
+ "mean_token_accuracy": 0.9960222035646439,
538
+ "num_tokens": 1710404.0,
539
+ "step": 5300
540
+ },
541
+ {
542
+ "entropy": 0.020103816259652376,
543
+ "epoch": 0.576,
544
+ "grad_norm": 0.014174265787005424,
545
+ "learning_rate": 1.1867555555555556e-05,
546
+ "loss": 0.010200117826461791,
547
+ "mean_token_accuracy": 0.9967270520329475,
548
+ "num_tokens": 2686777.0,
549
+ "step": 5400
550
+ },
551
+ {
552
+ "entropy": 0.020467385197989643,
553
+ "epoch": 0.5866666666666667,
554
+ "grad_norm": 0.014257642440497875,
555
+ "learning_rate": 1.1778666666666666e-05,
556
+ "loss": 0.010185201168060303,
557
+ "mean_token_accuracy": 0.9966062535345555,
558
+ "num_tokens": 3655445.0,
559
+ "step": 5500
560
+ },
561
+ {
562
+ "entropy": 0.020591240752474878,
563
+ "epoch": 0.5973333333333334,
564
+ "grad_norm": 0.011690130457282066,
565
+ "learning_rate": 1.1689777777777778e-05,
566
+ "loss": 0.010783259419427402,
567
+ "mean_token_accuracy": 0.9964483192433482,
568
+ "num_tokens": 686164.0,
569
+ "step": 5600
570
+ },
571
+ {
572
+ "entropy": 0.0205141630244907,
573
+ "epoch": 0.608,
574
+ "grad_norm": 0.006469405256211758,
575
+ "learning_rate": 1.160088888888889e-05,
576
+ "loss": 0.010492314100265503,
577
+ "mean_token_accuracy": 0.9966046234965324,
578
+ "num_tokens": 1662167.0,
579
+ "step": 5700
580
+ },
581
+ {
582
+ "entropy": 0.020147288321750237,
583
+ "epoch": 0.6186666666666667,
584
+ "grad_norm": 0.01614871807396412,
585
+ "learning_rate": 1.1512e-05,
586
+ "loss": 0.010045292377471924,
587
+ "mean_token_accuracy": 0.9966985350847244,
588
+ "num_tokens": 2643221.0,
589
+ "step": 5800
590
+ },
591
+ {
592
+ "entropy": 0.01912423676229082,
593
+ "epoch": 0.6293333333333333,
594
+ "grad_norm": 0.010019957087934017,
595
+ "learning_rate": 1.1423111111111111e-05,
596
+ "loss": 0.009680591821670533,
597
+ "mean_token_accuracy": 0.9968334528803825,
598
+ "num_tokens": 3611899.0,
599
+ "step": 5900
600
+ },
601
+ {
602
+ "epoch": 0.16,
603
+ "grad_norm": 1.419013500213623,
604
+ "learning_rate": 1.2e-05,
605
+ "loss": 0.0443,
606
+ "step": 6000
607
+ },
608
+ {
609
+ "epoch": 0.16266666666666665,
610
+ "grad_norm": 0.0630747601389885,
611
+ "learning_rate": 1.22e-05,
612
+ "loss": 0.0066,
613
+ "step": 6100
614
+ },
615
+ {
616
+ "epoch": 0.16533333333333333,
617
+ "grad_norm": 0.08701734989881516,
618
+ "learning_rate": 1.24e-05,
619
+ "loss": 0.0171,
620
+ "step": 6200
621
+ },
622
+ {
623
+ "epoch": 0.168,
624
+ "grad_norm": 0.7274932861328125,
625
+ "learning_rate": 1.26e-05,
626
+ "loss": 0.0065,
627
+ "step": 6300
628
+ },
629
+ {
630
+ "epoch": 0.17066666666666666,
631
+ "grad_norm": 0.07388290017843246,
632
+ "learning_rate": 1.2800000000000001e-05,
633
+ "loss": 0.0087,
634
+ "step": 6400
635
+ },
636
+ {
637
+ "epoch": 0.17333333333333334,
638
+ "grad_norm": 0.045733992010354996,
639
+ "learning_rate": 1.3000000000000001e-05,
640
+ "loss": 0.0114,
641
+ "step": 6500
642
+ },
643
+ {
644
+ "epoch": 0.176,
645
+ "grad_norm": 0.4374091625213623,
646
+ "learning_rate": 1.32e-05,
647
+ "loss": 0.0139,
648
+ "step": 6600
649
+ },
650
+ {
651
+ "epoch": 0.17866666666666667,
652
+ "grad_norm": 0.050165899097919464,
653
+ "learning_rate": 1.34e-05,
654
+ "loss": 0.0145,
655
+ "step": 6700
656
+ },
657
+ {
658
+ "epoch": 0.18133333333333335,
659
+ "grad_norm": 0.047875139862298965,
660
+ "learning_rate": 1.36e-05,
661
+ "loss": 0.0101,
662
+ "step": 6800
663
+ },
664
+ {
665
+ "epoch": 0.184,
666
+ "grad_norm": 0.06959123909473419,
667
+ "learning_rate": 1.3800000000000002e-05,
668
+ "loss": 0.0126,
669
+ "step": 6900
670
+ },
671
+ {
672
+ "epoch": 0.18666666666666668,
673
+ "grad_norm": 0.0340999960899353,
674
+ "learning_rate": 1.4e-05,
675
+ "loss": 0.0084,
676
+ "step": 7000
677
+ },
678
+ {
679
+ "epoch": 0.18933333333333333,
680
+ "grad_norm": 0.049361471086740494,
681
+ "learning_rate": 1.42e-05,
682
+ "loss": 0.0123,
683
+ "step": 7100
684
+ },
685
+ {
686
+ "epoch": 0.192,
687
+ "grad_norm": 0.11002720147371292,
688
+ "learning_rate": 1.44e-05,
689
+ "loss": 0.0111,
690
+ "step": 7200
691
+ },
692
+ {
693
+ "epoch": 0.19466666666666665,
694
+ "grad_norm": 0.04423600062727928,
695
+ "learning_rate": 1.46e-05,
696
+ "loss": 0.0082,
697
+ "step": 7300
698
+ },
699
+ {
700
+ "epoch": 0.19733333333333333,
701
+ "grad_norm": 0.052411410957574844,
702
+ "learning_rate": 1.48e-05,
703
+ "loss": 0.0042,
704
+ "step": 7400
705
+ },
706
+ {
707
+ "epoch": 0.2,
708
+ "grad_norm": 0.045492108911275864,
709
+ "learning_rate": 1.5e-05,
710
+ "loss": 0.02,
711
+ "step": 7500
712
+ },
713
+ {
714
+ "epoch": 0.20266666666666666,
715
+ "grad_norm": 0.07257890701293945,
716
+ "learning_rate": 1.4977777777777778e-05,
717
+ "loss": 0.0109,
718
+ "step": 7600
719
+ },
720
+ {
721
+ "epoch": 0.20533333333333334,
722
+ "grad_norm": 0.0558447502553463,
723
+ "learning_rate": 1.4955555555555556e-05,
724
+ "loss": 0.0065,
725
+ "step": 7700
726
+ },
727
+ {
728
+ "epoch": 0.208,
729
+ "grad_norm": 0.0893528088927269,
730
+ "learning_rate": 1.4933333333333333e-05,
731
+ "loss": 0.0067,
732
+ "step": 7800
733
+ },
734
+ {
735
+ "epoch": 0.21066666666666667,
736
+ "grad_norm": 0.033552590757608414,
737
+ "learning_rate": 1.4911111111111113e-05,
738
+ "loss": 0.0056,
739
+ "step": 7900
740
+ },
741
+ {
742
+ "epoch": 0.21333333333333335,
743
+ "grad_norm": 0.05111644044518471,
744
+ "learning_rate": 1.4888888888888888e-05,
745
+ "loss": 0.0061,
746
+ "step": 8000
747
+ },
748
+ {
749
+ "epoch": 0.216,
750
+ "grad_norm": 0.07947065681219101,
751
+ "learning_rate": 1.4866666666666668e-05,
752
+ "loss": 0.0078,
753
+ "step": 8100
754
+ },
755
+ {
756
+ "epoch": 0.21866666666666668,
757
+ "grad_norm": 0.04130572825670242,
758
+ "learning_rate": 1.4844444444444445e-05,
759
+ "loss": 0.0113,
760
+ "step": 8200
761
+ },
762
+ {
763
+ "epoch": 0.22133333333333333,
764
+ "grad_norm": 0.07327134907245636,
765
+ "learning_rate": 1.4822222222222221e-05,
766
+ "loss": 0.0128,
767
+ "step": 8300
768
+ },
769
+ {
770
+ "epoch": 0.224,
771
+ "grad_norm": 0.03684012219309807,
772
+ "learning_rate": 1.48e-05,
773
+ "loss": 0.0053,
774
+ "step": 8400
775
+ },
776
+ {
777
+ "epoch": 0.22666666666666666,
778
+ "grad_norm": 0.03126613050699234,
779
+ "learning_rate": 1.4777777777777778e-05,
780
+ "loss": 0.0079,
781
+ "step": 8500
782
+ },
783
+ {
784
+ "epoch": 0.22933333333333333,
785
+ "grad_norm": 0.06458591669797897,
786
+ "learning_rate": 1.4755555555555556e-05,
787
+ "loss": 0.016,
788
+ "step": 8600
789
+ },
790
+ {
791
+ "epoch": 0.232,
792
+ "grad_norm": 0.03598800674080849,
793
+ "learning_rate": 1.4733333333333333e-05,
794
+ "loss": 0.0043,
795
+ "step": 8700
796
+ },
797
+ {
798
+ "epoch": 0.23466666666666666,
799
+ "grad_norm": 0.032879263162612915,
800
+ "learning_rate": 1.4711111111111111e-05,
801
+ "loss": 0.01,
802
+ "step": 8800
803
+ },
804
+ {
805
+ "epoch": 0.23733333333333334,
806
+ "grad_norm": 0.04179929941892624,
807
+ "learning_rate": 1.4688888888888889e-05,
808
+ "loss": 0.0088,
809
+ "step": 8900
810
+ },
811
+ {
812
+ "epoch": 0.24,
813
+ "grad_norm": 0.025431055575609207,
814
+ "learning_rate": 1.4666666666666666e-05,
815
+ "loss": 0.0103,
816
+ "step": 9000
817
+ },
818
+ {
819
+ "epoch": 0.24266666666666667,
820
+ "grad_norm": 0.2131260633468628,
821
+ "learning_rate": 1.4644444444444446e-05,
822
+ "loss": 0.0144,
823
+ "step": 9100
824
+ },
825
+ {
826
+ "epoch": 0.24533333333333332,
827
+ "grad_norm": 0.03861390799283981,
828
+ "learning_rate": 1.4622222222222223e-05,
829
+ "loss": 0.0088,
830
+ "step": 9200
831
+ },
832
+ {
833
+ "epoch": 0.248,
834
+ "grad_norm": 0.026965836063027382,
835
+ "learning_rate": 1.46e-05,
836
+ "loss": 0.0102,
837
+ "step": 9300
838
+ },
839
+ {
840
+ "epoch": 0.25066666666666665,
841
+ "grad_norm": 0.02597722038626671,
842
+ "learning_rate": 1.4577777777777778e-05,
843
+ "loss": 0.0053,
844
+ "step": 9400
845
+ },
846
+ {
847
+ "epoch": 0.25333333333333335,
848
+ "grad_norm": 0.03126470744609833,
849
+ "learning_rate": 1.4555555555555556e-05,
850
+ "loss": 0.0077,
851
+ "step": 9500
852
+ },
853
+ {
854
+ "epoch": 0.256,
855
+ "grad_norm": 0.6931378841400146,
856
+ "learning_rate": 1.4533333333333334e-05,
857
+ "loss": 0.0077,
858
+ "step": 9600
859
+ },
860
+ {
861
+ "epoch": 0.25866666666666666,
862
+ "grad_norm": 0.026384815573692322,
863
+ "learning_rate": 1.4511111111111111e-05,
864
+ "loss": 0.0091,
865
+ "step": 9700
866
+ },
867
+ {
868
+ "epoch": 0.2613333333333333,
869
+ "grad_norm": 0.030717667192220688,
870
+ "learning_rate": 1.448888888888889e-05,
871
+ "loss": 0.0147,
872
+ "step": 9800
873
+ },
874
+ {
875
+ "epoch": 0.264,
876
+ "grad_norm": 0.02856365405023098,
877
+ "learning_rate": 1.4466666666666667e-05,
878
+ "loss": 0.0146,
879
+ "step": 9900
880
+ },
881
+ {
882
+ "epoch": 0.26666666666666666,
883
+ "grad_norm": 0.07391875237226486,
884
+ "learning_rate": 1.4444444444444444e-05,
885
+ "loss": 0.0072,
886
+ "step": 10000
887
+ },
888
+ {
889
+ "epoch": 0.2693333333333333,
890
+ "grad_norm": 0.026893191039562225,
891
+ "learning_rate": 1.4422222222222223e-05,
892
+ "loss": 0.0071,
893
+ "step": 10100
894
+ },
895
+ {
896
+ "epoch": 0.272,
897
+ "grad_norm": 0.04391603171825409,
898
+ "learning_rate": 1.44e-05,
899
+ "loss": 0.0124,
900
+ "step": 10200
901
+ },
902
+ {
903
+ "epoch": 0.27466666666666667,
904
+ "grad_norm": 0.03955280780792236,
905
+ "learning_rate": 1.4377777777777779e-05,
906
+ "loss": 0.0099,
907
+ "step": 10300
908
+ },
909
+ {
910
+ "epoch": 0.2773333333333333,
911
+ "grad_norm": 0.031186288222670555,
912
+ "learning_rate": 1.4355555555555556e-05,
913
+ "loss": 0.0099,
914
+ "step": 10400
915
+ },
916
+ {
917
+ "epoch": 0.28,
918
+ "grad_norm": 0.04440930485725403,
919
+ "learning_rate": 1.4333333333333334e-05,
920
+ "loss": 0.0042,
921
+ "step": 10500
922
+ },
923
+ {
924
+ "epoch": 0.2826666666666667,
925
+ "grad_norm": 0.035457074642181396,
926
+ "learning_rate": 1.4311111111111111e-05,
927
+ "loss": 0.0179,
928
+ "step": 10600
929
+ },
930
+ {
931
+ "epoch": 0.2853333333333333,
932
+ "grad_norm": 0.042004115879535675,
933
+ "learning_rate": 1.4288888888888889e-05,
934
+ "loss": 0.007,
935
+ "step": 10700
936
+ },
937
+ {
938
+ "epoch": 0.288,
939
+ "grad_norm": 0.0251617394387722,
940
+ "learning_rate": 1.4266666666666667e-05,
941
+ "loss": 0.0071,
942
+ "step": 10800
943
+ },
944
+ {
945
+ "epoch": 0.2906666666666667,
946
+ "grad_norm": 0.027697479352355003,
947
+ "learning_rate": 1.4244444444444444e-05,
948
+ "loss": 0.0046,
949
+ "step": 10900
950
+ },
951
+ {
952
+ "epoch": 0.29333333333333333,
953
+ "grad_norm": 0.035603832453489304,
954
+ "learning_rate": 1.4222222222222224e-05,
955
+ "loss": 0.0114,
956
+ "step": 11000
957
+ },
958
+ {
959
+ "epoch": 0.296,
960
+ "grad_norm": 0.2773999571800232,
961
+ "learning_rate": 1.42e-05,
962
+ "loss": 0.0195,
963
+ "step": 11100
964
+ },
965
+ {
966
+ "epoch": 0.2986666666666667,
967
+ "grad_norm": 0.041750043630599976,
968
+ "learning_rate": 1.4177777777777779e-05,
969
+ "loss": 0.0095,
970
+ "step": 11200
971
+ },
972
+ {
973
+ "epoch": 0.30133333333333334,
974
+ "grad_norm": 0.03574356436729431,
975
+ "learning_rate": 1.4155555555555556e-05,
976
+ "loss": 0.0063,
977
+ "step": 11300
978
+ }
979
+ ],
980
+ "logging_steps": 100,
981
+ "max_steps": 75000,
982
+ "num_input_tokens_seen": 0,
983
+ "num_train_epochs": 2,
984
+ "save_steps": 100,
985
+ "stateful_callbacks": {
986
+ "TrainerControl": {
987
+ "args": {
988
+ "should_epoch_stop": false,
989
+ "should_evaluate": false,
990
+ "should_log": false,
991
+ "should_save": true,
992
+ "should_training_stop": false
993
+ },
994
+ "attributes": {}
995
+ }
996
+ },
997
+ "total_flos": 4.1935656459064443e+18,
998
+ "train_batch_size": 4,
999
+ "trial_name": null,
1000
+ "trial_params": null
1001
+ }
checkpoint-11300/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40a4c02bff4c325a39b29bbad54a640a82b1bc361c29cc2656ac8d29cf43eaa
3
+ size 5432
checkpoint-11400/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Dorn4449/CyberSentinel-Mistral-7B-v3.8
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
checkpoint-11400/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-11400/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e521d8364841bb17ab8ebaa0fc1ec455803cb2deb0f55b7a486cf2251398b69
3
+ size 27297032
checkpoint-11400/checkpoint-6000/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-11400/checkpoint-6000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:286625e19ea92f6bcbc715bf5d40bac2ac85da8b9ecb31679d50c38cb4b4b694
3
+ size 27297032
checkpoint-11400/checkpoint-6100/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-11400/checkpoint-6100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0a6ee2942c85113263a2331006fa30db088217df193b0fca3de2eebe282399b
3
+ size 27297032
checkpoint-11400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1cb477f3bf32d0e2fb0fe1174f430ffb818f31be2b490688f3e4c2cb13b03d3
3
+ size 54744314
checkpoint-11400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b928e6b6c299335f6b9be16f0b17b28a18fceb39576812717829e79c7cab6174
3
+ size 14244
checkpoint-11400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab6b8ee101afb64e8443bc7bf27a96edb471951013dafbb400a0d2ceba714792
3
+ size 1064
checkpoint-11400/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-11400/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-11400/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n",
34
+ "clean_up_tokenization_spaces": false,
35
+ "eos_token": "</s>",
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
checkpoint-11400/trainer_state.json ADDED
@@ -0,0 +1,1008 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.304,
5
+ "eval_steps": 500,
6
+ "global_step": 11400,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "entropy": 1.084044404476881,
13
+ "epoch": 0.010666666666666666,
14
+ "grad_norm": 0.796875,
15
+ "learning_rate": 7.92e-07,
16
+ "loss": 1.3782466125488282,
17
+ "mean_token_accuracy": 0.7083857330679894,
18
+ "num_tokens": 986491.0,
19
+ "step": 100
20
+ },
21
+ {
22
+ "entropy": 1.1007767178118228,
23
+ "epoch": 0.021333333333333333,
24
+ "grad_norm": 0.3125,
25
+ "learning_rate": 1.592e-06,
26
+ "loss": 1.2902149963378906,
27
+ "mean_token_accuracy": 0.7171274860203266,
28
+ "num_tokens": 1952567.0,
29
+ "step": 200
30
+ },
31
+ {
32
+ "entropy": 1.0802835547924041,
33
+ "epoch": 0.032,
34
+ "grad_norm": 0.28515625,
35
+ "learning_rate": 2.392e-06,
36
+ "loss": 1.170371322631836,
37
+ "mean_token_accuracy": 0.7354862231016159,
38
+ "num_tokens": 2935479.0,
39
+ "step": 300
40
+ },
41
+ {
42
+ "entropy": 1.0573877203464508,
43
+ "epoch": 0.042666666666666665,
44
+ "grad_norm": 0.23046875,
45
+ "learning_rate": 3.192e-06,
46
+ "loss": 1.062843780517578,
47
+ "mean_token_accuracy": 0.7536254300177098,
48
+ "num_tokens": 3920344.0,
49
+ "step": 400
50
+ },
51
+ {
52
+ "entropy": 1.0149462178349495,
53
+ "epoch": 0.05333333333333334,
54
+ "grad_norm": 0.212890625,
55
+ "learning_rate": 3.992e-06,
56
+ "loss": 0.9590372467041015,
57
+ "mean_token_accuracy": 0.7738636130094528,
58
+ "num_tokens": 4899217.0,
59
+ "step": 500
60
+ },
61
+ {
62
+ "entropy": 0.8931056271493435,
63
+ "epoch": 0.064,
64
+ "grad_norm": 0.28125,
65
+ "learning_rate": 4.792e-06,
66
+ "loss": 0.7998863983154297,
67
+ "mean_token_accuracy": 0.8012332341074944,
68
+ "num_tokens": 5883391.0,
69
+ "step": 600
70
+ },
71
+ {
72
+ "entropy": 0.6136953190714121,
73
+ "epoch": 0.07466666666666667,
74
+ "grad_norm": 1.1015625,
75
+ "learning_rate": 5.592000000000001e-06,
76
+ "loss": 0.5131131362915039,
77
+ "mean_token_accuracy": 0.866335421949625,
78
+ "num_tokens": 6857125.0,
79
+ "step": 700
80
+ },
81
+ {
82
+ "entropy": 0.2726459547691047,
83
+ "epoch": 0.08533333333333333,
84
+ "grad_norm": 0.474609375,
85
+ "learning_rate": 6.392e-06,
86
+ "loss": 0.18439870834350586,
87
+ "mean_token_accuracy": 0.9579810079932213,
88
+ "num_tokens": 7831290.0,
89
+ "step": 800
90
+ },
91
+ {
92
+ "entropy": 0.11284050881862641,
93
+ "epoch": 0.096,
94
+ "grad_norm": 0.1875,
95
+ "learning_rate": 7.192e-06,
96
+ "loss": 0.05271556854248047,
97
+ "mean_token_accuracy": 0.9904264670610428,
98
+ "num_tokens": 8815417.0,
99
+ "step": 900
100
+ },
101
+ {
102
+ "entropy": 0.068436967888847,
103
+ "epoch": 0.10666666666666667,
104
+ "grad_norm": 0.125,
105
+ "learning_rate": 7.992e-06,
106
+ "loss": 0.02549468755722046,
107
+ "mean_token_accuracy": 0.9951217715442181,
108
+ "num_tokens": 9782033.0,
109
+ "step": 1000
110
+ },
111
+ {
112
+ "entropy": 0.052372096767649055,
113
+ "epoch": 0.11733333333333333,
114
+ "grad_norm": 0.1318359375,
115
+ "learning_rate": 8.792e-06,
116
+ "loss": 0.017259199619293213,
117
+ "mean_token_accuracy": 0.9962555834650993,
118
+ "num_tokens": 10759962.0,
119
+ "step": 1100
120
+ },
121
+ {
122
+ "entropy": 0.03862797610927373,
123
+ "epoch": 0.128,
124
+ "grad_norm": 0.0867946669459343,
125
+ "learning_rate": 9.591999999999999e-06,
126
+ "loss": 0.013324768543243408,
127
+ "mean_token_accuracy": 0.9968111206591129,
128
+ "num_tokens": 987986.0,
129
+ "step": 1200
130
+ },
131
+ {
132
+ "entropy": 0.032093781144358215,
133
+ "epoch": 0.13866666666666666,
134
+ "grad_norm": 0.037592533975839615,
135
+ "learning_rate": 1.0392e-05,
136
+ "loss": 0.013392001390457153,
137
+ "mean_token_accuracy": 0.9965760576725006,
138
+ "num_tokens": 1968127.0,
139
+ "step": 1300
140
+ },
141
+ {
142
+ "entropy": 0.031904329673852774,
143
+ "epoch": 0.14933333333333335,
144
+ "grad_norm": 0.10515860468149185,
145
+ "learning_rate": 1.1192e-05,
146
+ "loss": 0.016373103857040404,
147
+ "mean_token_accuracy": 0.9955054900050163,
148
+ "num_tokens": 2965677.0,
149
+ "step": 1400
150
+ },
151
+ {
152
+ "entropy": 0.0265167937008664,
153
+ "epoch": 0.16,
154
+ "grad_norm": 0.12689532339572906,
155
+ "learning_rate": 1.1992e-05,
156
+ "loss": 0.011554093360900878,
157
+ "mean_token_accuracy": 0.996508517563343,
158
+ "num_tokens": 3938144.0,
159
+ "step": 1500
160
+ },
161
+ {
162
+ "entropy": 0.024146563813555986,
163
+ "epoch": 0.17066666666666666,
164
+ "grad_norm": 0.03401608020067215,
165
+ "learning_rate": 1.2792e-05,
166
+ "loss": 0.010398292541503906,
167
+ "mean_token_accuracy": 0.9966818282008171,
168
+ "num_tokens": 4897019.0,
169
+ "step": 1600
170
+ },
171
+ {
172
+ "entropy": 0.025755486716516316,
173
+ "epoch": 0.18133333333333335,
174
+ "grad_norm": 0.05834396556019783,
175
+ "learning_rate": 1.3592000000000001e-05,
176
+ "loss": 0.012847075462341309,
177
+ "mean_token_accuracy": 0.9960671140253544,
178
+ "num_tokens": 5880125.0,
179
+ "step": 1700
180
+ },
181
+ {
182
+ "entropy": 0.023504739217460154,
183
+ "epoch": 0.192,
184
+ "grad_norm": 0.09117468446493149,
185
+ "learning_rate": 1.4392e-05,
186
+ "loss": 0.011869451999664306,
187
+ "mean_token_accuracy": 0.9964955732226372,
188
+ "num_tokens": 6867360.0,
189
+ "step": 1800
190
+ },
191
+ {
192
+ "entropy": 0.022944011739455164,
193
+ "epoch": 0.20266666666666666,
194
+ "grad_norm": 0.046192608773708344,
195
+ "learning_rate": 1.4978666666666668e-05,
196
+ "loss": 0.010620262622833252,
197
+ "mean_token_accuracy": 0.9964672869443894,
198
+ "num_tokens": 7840472.0,
199
+ "step": 1900
200
+ },
201
+ {
202
+ "entropy": 0.018894561287015676,
203
+ "epoch": 0.21333333333333335,
204
+ "grad_norm": 0.02761668898165226,
205
+ "learning_rate": 1.4889777777777778e-05,
206
+ "loss": 0.0067046540975570675,
207
+ "mean_token_accuracy": 0.9975774252414703,
208
+ "num_tokens": 8827474.0,
209
+ "step": 2000
210
+ },
211
+ {
212
+ "entropy": 0.020934174589347095,
213
+ "epoch": 0.224,
214
+ "grad_norm": 0.02931295707821846,
215
+ "learning_rate": 1.4800888888888889e-05,
216
+ "loss": 0.010140993595123292,
217
+ "mean_token_accuracy": 0.996714953482151,
218
+ "num_tokens": 9808475.0,
219
+ "step": 2100
220
+ },
221
+ {
222
+ "entropy": 0.020719884738791734,
223
+ "epoch": 0.23466666666666666,
224
+ "grad_norm": 0.02652113325893879,
225
+ "learning_rate": 1.4712e-05,
226
+ "loss": 0.010557392835617066,
227
+ "mean_token_accuracy": 0.9969246552884579,
228
+ "num_tokens": 10782725.0,
229
+ "step": 2200
230
+ },
231
+ {
232
+ "entropy": 0.020559412932489068,
233
+ "epoch": 0.24533333333333332,
234
+ "grad_norm": 0.032906673848629,
235
+ "learning_rate": 1.4623111111111113e-05,
236
+ "loss": 0.009789772033691406,
237
+ "mean_token_accuracy": 0.9966975942254066,
238
+ "num_tokens": 11769893.0,
239
+ "step": 2300
240
+ },
241
+ {
242
+ "entropy": 0.018388252432923764,
243
+ "epoch": 0.256,
244
+ "grad_norm": 0.08162333816289902,
245
+ "learning_rate": 1.4534222222222222e-05,
246
+ "loss": 0.007909480929374695,
247
+ "mean_token_accuracy": 0.9972486282885075,
248
+ "num_tokens": 12753597.0,
249
+ "step": 2400
250
+ },
251
+ {
252
+ "entropy": 0.02204789153067395,
253
+ "epoch": 0.26666666666666666,
254
+ "grad_norm": 0.023511990904808044,
255
+ "learning_rate": 1.4445333333333334e-05,
256
+ "loss": 0.011945382356643677,
257
+ "mean_token_accuracy": 0.9963182592391968,
258
+ "num_tokens": 13739044.0,
259
+ "step": 2500
260
+ },
261
+ {
262
+ "entropy": 0.02037038065260276,
263
+ "epoch": 0.2773333333333333,
264
+ "grad_norm": 0.02014540508389473,
265
+ "learning_rate": 1.4356444444444446e-05,
266
+ "loss": 0.010165022611618042,
267
+ "mean_token_accuracy": 0.9967658732831478,
268
+ "num_tokens": 14709501.0,
269
+ "step": 2600
270
+ },
271
+ {
272
+ "entropy": 0.01947357293218374,
273
+ "epoch": 0.288,
274
+ "grad_norm": 0.01829116977751255,
275
+ "learning_rate": 1.4267555555555555e-05,
276
+ "loss": 0.009100326895713806,
277
+ "mean_token_accuracy": 0.9969173397123814,
278
+ "num_tokens": 15693309.0,
279
+ "step": 2700
280
+ },
281
+ {
282
+ "entropy": 0.021565243480727078,
283
+ "epoch": 0.2986666666666667,
284
+ "grad_norm": 0.031732361763715744,
285
+ "learning_rate": 1.4178666666666667e-05,
286
+ "loss": 0.01107092022895813,
287
+ "mean_token_accuracy": 0.9963353677093982,
288
+ "num_tokens": 985520.0,
289
+ "step": 2800
290
+ },
291
+ {
292
+ "entropy": 0.02136099517461844,
293
+ "epoch": 0.30933333333333335,
294
+ "grad_norm": 0.03870174661278725,
295
+ "learning_rate": 1.4089777777777779e-05,
296
+ "loss": 0.011569523811340332,
297
+ "mean_token_accuracy": 0.9964252272248268,
298
+ "num_tokens": 1969572.0,
299
+ "step": 2900
300
+ },
301
+ {
302
+ "entropy": 0.019074252294376492,
303
+ "epoch": 0.32,
304
+ "grad_norm": 0.01954864338040352,
305
+ "learning_rate": 1.4000888888888888e-05,
306
+ "loss": 0.008727578520774841,
307
+ "mean_token_accuracy": 0.9970971086621284,
308
+ "num_tokens": 2945404.0,
309
+ "step": 3000
310
+ },
311
+ {
312
+ "entropy": 0.018130726229865102,
313
+ "epoch": 0.33066666666666666,
314
+ "grad_norm": 0.01979902759194374,
315
+ "learning_rate": 1.3912e-05,
316
+ "loss": 0.007898266315460206,
317
+ "mean_token_accuracy": 0.9971335357427598,
318
+ "num_tokens": 3925122.0,
319
+ "step": 3100
320
+ },
321
+ {
322
+ "entropy": 0.019400757937546587,
323
+ "epoch": 0.3413333333333333,
324
+ "grad_norm": 0.030273959040641785,
325
+ "learning_rate": 1.0980206234443522e-05,
326
+ "loss": 0.009979066749413809,
327
+ "mean_token_accuracy": 0.9968534293584526,
328
+ "num_tokens": 945781.0,
329
+ "step": 3200
330
+ },
331
+ {
332
+ "entropy": 0.021496493350714446,
333
+ "epoch": 0.352,
334
+ "grad_norm": 0.023218955844640732,
335
+ "learning_rate": 1.0802417921061989e-05,
336
+ "loss": 0.011733214855194091,
337
+ "mean_token_accuracy": 0.996385814100504,
338
+ "num_tokens": 1919451.0,
339
+ "step": 3300
340
+ },
341
+ {
342
+ "entropy": 0.01932728004641831,
343
+ "epoch": 0.3626666666666667,
344
+ "grad_norm": 0.014179096557199955,
345
+ "learning_rate": 1.0624629607680455e-05,
346
+ "loss": 0.00976854920387268,
347
+ "mean_token_accuracy": 0.9968989025056362,
348
+ "num_tokens": 2892220.0,
349
+ "step": 3400
350
+ },
351
+ {
352
+ "entropy": 0.02088767145993188,
353
+ "epoch": 0.37333333333333335,
354
+ "grad_norm": 0.032297272235155106,
355
+ "learning_rate": 1.0446841294298921e-05,
356
+ "loss": 0.011026575565338134,
357
+ "mean_token_accuracy": 0.9964779444038868,
358
+ "num_tokens": 3878184.0,
359
+ "step": 3500
360
+ },
361
+ {
362
+ "entropy": 0.018377634914308463,
363
+ "epoch": 0.384,
364
+ "grad_norm": 0.012006225995719433,
365
+ "learning_rate": 1.3467555555555556e-05,
366
+ "loss": 0.008617221661235975,
367
+ "mean_token_accuracy": 0.9970365500320559,
368
+ "num_tokens": 904843.0,
369
+ "step": 3600
370
+ },
371
+ {
372
+ "entropy": 0.016050403744447977,
373
+ "epoch": 0.39466666666666667,
374
+ "grad_norm": 0.011128585785627365,
375
+ "learning_rate": 1.3378666666666666e-05,
376
+ "loss": 0.006250782608985901,
377
+ "mean_token_accuracy": 0.9976212471723557,
378
+ "num_tokens": 1878860.0,
379
+ "step": 3700
380
+ },
381
+ {
382
+ "entropy": 0.01809800002258271,
383
+ "epoch": 0.4053333333333333,
384
+ "grad_norm": 0.01952100545167923,
385
+ "learning_rate": 1.3289777777777778e-05,
386
+ "loss": 0.008064679503440857,
387
+ "mean_token_accuracy": 0.9971583542227745,
388
+ "num_tokens": 2860744.0,
389
+ "step": 3800
390
+ },
391
+ {
392
+ "entropy": 0.019823117861524225,
393
+ "epoch": 0.416,
394
+ "grad_norm": 0.011735321022570133,
395
+ "learning_rate": 1.3200888888888889e-05,
396
+ "loss": 0.010386246442794799,
397
+ "mean_token_accuracy": 0.9966941741108895,
398
+ "num_tokens": 3831565.0,
399
+ "step": 3900
400
+ },
401
+ {
402
+ "entropy": 0.02049923066298889,
403
+ "epoch": 0.4266666666666667,
404
+ "grad_norm": 0.006284466944634914,
405
+ "learning_rate": 1.3112e-05,
406
+ "loss": 0.01045264061107192,
407
+ "mean_token_accuracy": 0.9964593903616418,
408
+ "num_tokens": 849872.0,
409
+ "step": 4000
410
+ },
411
+ {
412
+ "entropy": 0.017120514765847476,
413
+ "epoch": 0.43733333333333335,
414
+ "grad_norm": 0.010025433264672756,
415
+ "learning_rate": 1.3023111111111111e-05,
416
+ "loss": 0.007535084486007691,
417
+ "mean_token_accuracy": 0.9973040929436684,
418
+ "num_tokens": 1826365.0,
419
+ "step": 4100
420
+ },
421
+ {
422
+ "entropy": 0.01937343619065359,
423
+ "epoch": 0.448,
424
+ "grad_norm": 0.008356385864317417,
425
+ "learning_rate": 1.2934222222222222e-05,
426
+ "loss": 0.010190980434417725,
427
+ "mean_token_accuracy": 0.9967715987563133,
428
+ "num_tokens": 2802749.0,
429
+ "step": 4200
430
+ },
431
+ {
432
+ "entropy": 0.017232202125014737,
433
+ "epoch": 0.45866666666666667,
434
+ "grad_norm": 0.011389357037842274,
435
+ "learning_rate": 1.2845333333333334e-05,
436
+ "loss": 0.007943087816238403,
437
+ "mean_token_accuracy": 0.9973043432831764,
438
+ "num_tokens": 3802758.0,
439
+ "step": 4300
440
+ },
441
+ {
442
+ "entropy": 0.022822092252748984,
443
+ "epoch": 0.4693333333333333,
444
+ "grad_norm": 0.017312563955783844,
445
+ "learning_rate": 1.2756444444444444e-05,
446
+ "loss": 0.012838510819423346,
447
+ "mean_token_accuracy": 0.9959132893953795,
448
+ "num_tokens": 789736.0,
449
+ "step": 4400
450
+ },
451
+ {
452
+ "entropy": 0.0184966369275935,
453
+ "epoch": 0.48,
454
+ "grad_norm": 0.04721185564994812,
455
+ "learning_rate": 1.2667555555555557e-05,
456
+ "loss": 0.009318522214889526,
457
+ "mean_token_accuracy": 0.9968563948571681,
458
+ "num_tokens": 1767600.0,
459
+ "step": 4500
460
+ },
461
+ {
462
+ "entropy": 0.021548866296652706,
463
+ "epoch": 0.49066666666666664,
464
+ "grad_norm": 0.9198243021965027,
465
+ "learning_rate": 1.2578666666666667e-05,
466
+ "loss": 0.013902791738510133,
467
+ "mean_token_accuracy": 0.9959381237626076,
468
+ "num_tokens": 2754982.0,
469
+ "step": 4600
470
+ },
471
+ {
472
+ "entropy": 0.0234364516264759,
473
+ "epoch": 0.5013333333333333,
474
+ "grad_norm": 0.011652004905045033,
475
+ "learning_rate": 1.2489777777777779e-05,
476
+ "loss": 0.011085785627365112,
477
+ "mean_token_accuracy": 0.9966410009562969,
478
+ "num_tokens": 3730109.0,
479
+ "step": 4700
480
+ },
481
+ {
482
+ "entropy": 0.017517962178529856,
483
+ "epoch": 0.512,
484
+ "grad_norm": 0.008591280318796635,
485
+ "learning_rate": 1.240088888888889e-05,
486
+ "loss": 0.0068675024168831965,
487
+ "mean_token_accuracy": 0.9973819294533173,
488
+ "num_tokens": 753786.0,
489
+ "step": 4800
490
+ },
491
+ {
492
+ "entropy": 0.019476991441333667,
493
+ "epoch": 0.5226666666666666,
494
+ "grad_norm": 0.013633953407406807,
495
+ "learning_rate": 1.2312e-05,
496
+ "loss": 0.009302983283996582,
497
+ "mean_token_accuracy": 0.9968732745945453,
498
+ "num_tokens": 1732738.0,
499
+ "step": 4900
500
+ },
501
+ {
502
+ "entropy": 0.020917424112558366,
503
+ "epoch": 0.5333333333333333,
504
+ "grad_norm": 0.01434489618986845,
505
+ "learning_rate": 1.2223111111111112e-05,
506
+ "loss": 0.010165597200393678,
507
+ "mean_token_accuracy": 0.9965643344819546,
508
+ "num_tokens": 2702069.0,
509
+ "step": 5000
510
+ },
511
+ {
512
+ "entropy": 0.017627096675569193,
513
+ "epoch": 0.544,
514
+ "grad_norm": 0.010028124786913395,
515
+ "learning_rate": 1.2134222222222223e-05,
516
+ "loss": 0.008581981062889099,
517
+ "mean_token_accuracy": 0.9972231885790825,
518
+ "num_tokens": 3692514.0,
519
+ "step": 5100
520
+ },
521
+ {
522
+ "entropy": 0.018209505494293832,
523
+ "epoch": 0.5546666666666666,
524
+ "grad_norm": 0.010386968962848186,
525
+ "learning_rate": 1.2045333333333333e-05,
526
+ "loss": 0.00814423689971099,
527
+ "mean_token_accuracy": 0.9972281313023051,
528
+ "num_tokens": 739184.0,
529
+ "step": 5200
530
+ },
531
+ {
532
+ "entropy": 0.02268622429575771,
533
+ "epoch": 0.5653333333333334,
534
+ "grad_norm": 0.01215888187289238,
535
+ "learning_rate": 1.1956444444444445e-05,
536
+ "loss": 0.012614715099334716,
537
+ "mean_token_accuracy": 0.9960222035646439,
538
+ "num_tokens": 1710404.0,
539
+ "step": 5300
540
+ },
541
+ {
542
+ "entropy": 0.020103816259652376,
543
+ "epoch": 0.576,
544
+ "grad_norm": 0.014174265787005424,
545
+ "learning_rate": 1.1867555555555556e-05,
546
+ "loss": 0.010200117826461791,
547
+ "mean_token_accuracy": 0.9967270520329475,
548
+ "num_tokens": 2686777.0,
549
+ "step": 5400
550
+ },
551
+ {
552
+ "entropy": 0.020467385197989643,
553
+ "epoch": 0.5866666666666667,
554
+ "grad_norm": 0.014257642440497875,
555
+ "learning_rate": 1.1778666666666666e-05,
556
+ "loss": 0.010185201168060303,
557
+ "mean_token_accuracy": 0.9966062535345555,
558
+ "num_tokens": 3655445.0,
559
+ "step": 5500
560
+ },
561
+ {
562
+ "entropy": 0.020591240752474878,
563
+ "epoch": 0.5973333333333334,
564
+ "grad_norm": 0.011690130457282066,
565
+ "learning_rate": 1.1689777777777778e-05,
566
+ "loss": 0.010783259419427402,
567
+ "mean_token_accuracy": 0.9964483192433482,
568
+ "num_tokens": 686164.0,
569
+ "step": 5600
570
+ },
571
+ {
572
+ "entropy": 0.0205141630244907,
573
+ "epoch": 0.608,
574
+ "grad_norm": 0.006469405256211758,
575
+ "learning_rate": 1.160088888888889e-05,
576
+ "loss": 0.010492314100265503,
577
+ "mean_token_accuracy": 0.9966046234965324,
578
+ "num_tokens": 1662167.0,
579
+ "step": 5700
580
+ },
581
+ {
582
+ "entropy": 0.020147288321750237,
583
+ "epoch": 0.6186666666666667,
584
+ "grad_norm": 0.01614871807396412,
585
+ "learning_rate": 1.1512e-05,
586
+ "loss": 0.010045292377471924,
587
+ "mean_token_accuracy": 0.9966985350847244,
588
+ "num_tokens": 2643221.0,
589
+ "step": 5800
590
+ },
591
+ {
592
+ "entropy": 0.01912423676229082,
593
+ "epoch": 0.6293333333333333,
594
+ "grad_norm": 0.010019957087934017,
595
+ "learning_rate": 1.1423111111111111e-05,
596
+ "loss": 0.009680591821670533,
597
+ "mean_token_accuracy": 0.9968334528803825,
598
+ "num_tokens": 3611899.0,
599
+ "step": 5900
600
+ },
601
+ {
602
+ "epoch": 0.16,
603
+ "grad_norm": 1.419013500213623,
604
+ "learning_rate": 1.2e-05,
605
+ "loss": 0.0443,
606
+ "step": 6000
607
+ },
608
+ {
609
+ "epoch": 0.16266666666666665,
610
+ "grad_norm": 0.0630747601389885,
611
+ "learning_rate": 1.22e-05,
612
+ "loss": 0.0066,
613
+ "step": 6100
614
+ },
615
+ {
616
+ "epoch": 0.16533333333333333,
617
+ "grad_norm": 0.08701734989881516,
618
+ "learning_rate": 1.24e-05,
619
+ "loss": 0.0171,
620
+ "step": 6200
621
+ },
622
+ {
623
+ "epoch": 0.168,
624
+ "grad_norm": 0.7274932861328125,
625
+ "learning_rate": 1.26e-05,
626
+ "loss": 0.0065,
627
+ "step": 6300
628
+ },
629
+ {
630
+ "epoch": 0.17066666666666666,
631
+ "grad_norm": 0.07388290017843246,
632
+ "learning_rate": 1.2800000000000001e-05,
633
+ "loss": 0.0087,
634
+ "step": 6400
635
+ },
636
+ {
637
+ "epoch": 0.17333333333333334,
638
+ "grad_norm": 0.045733992010354996,
639
+ "learning_rate": 1.3000000000000001e-05,
640
+ "loss": 0.0114,
641
+ "step": 6500
642
+ },
643
+ {
644
+ "epoch": 0.176,
645
+ "grad_norm": 0.4374091625213623,
646
+ "learning_rate": 1.32e-05,
647
+ "loss": 0.0139,
648
+ "step": 6600
649
+ },
650
+ {
651
+ "epoch": 0.17866666666666667,
652
+ "grad_norm": 0.050165899097919464,
653
+ "learning_rate": 1.34e-05,
654
+ "loss": 0.0145,
655
+ "step": 6700
656
+ },
657
+ {
658
+ "epoch": 0.18133333333333335,
659
+ "grad_norm": 0.047875139862298965,
660
+ "learning_rate": 1.36e-05,
661
+ "loss": 0.0101,
662
+ "step": 6800
663
+ },
664
+ {
665
+ "epoch": 0.184,
666
+ "grad_norm": 0.06959123909473419,
667
+ "learning_rate": 1.3800000000000002e-05,
668
+ "loss": 0.0126,
669
+ "step": 6900
670
+ },
671
+ {
672
+ "epoch": 0.18666666666666668,
673
+ "grad_norm": 0.0340999960899353,
674
+ "learning_rate": 1.4e-05,
675
+ "loss": 0.0084,
676
+ "step": 7000
677
+ },
678
+ {
679
+ "epoch": 0.18933333333333333,
680
+ "grad_norm": 0.049361471086740494,
681
+ "learning_rate": 1.42e-05,
682
+ "loss": 0.0123,
683
+ "step": 7100
684
+ },
685
+ {
686
+ "epoch": 0.192,
687
+ "grad_norm": 0.11002720147371292,
688
+ "learning_rate": 1.44e-05,
689
+ "loss": 0.0111,
690
+ "step": 7200
691
+ },
692
+ {
693
+ "epoch": 0.19466666666666665,
694
+ "grad_norm": 0.04423600062727928,
695
+ "learning_rate": 1.46e-05,
696
+ "loss": 0.0082,
697
+ "step": 7300
698
+ },
699
+ {
700
+ "epoch": 0.19733333333333333,
701
+ "grad_norm": 0.052411410957574844,
702
+ "learning_rate": 1.48e-05,
703
+ "loss": 0.0042,
704
+ "step": 7400
705
+ },
706
+ {
707
+ "epoch": 0.2,
708
+ "grad_norm": 0.045492108911275864,
709
+ "learning_rate": 1.5e-05,
710
+ "loss": 0.02,
711
+ "step": 7500
712
+ },
713
+ {
714
+ "epoch": 0.20266666666666666,
715
+ "grad_norm": 0.07257890701293945,
716
+ "learning_rate": 1.4977777777777778e-05,
717
+ "loss": 0.0109,
718
+ "step": 7600
719
+ },
720
+ {
721
+ "epoch": 0.20533333333333334,
722
+ "grad_norm": 0.0558447502553463,
723
+ "learning_rate": 1.4955555555555556e-05,
724
+ "loss": 0.0065,
725
+ "step": 7700
726
+ },
727
+ {
728
+ "epoch": 0.208,
729
+ "grad_norm": 0.0893528088927269,
730
+ "learning_rate": 1.4933333333333333e-05,
731
+ "loss": 0.0067,
732
+ "step": 7800
733
+ },
734
+ {
735
+ "epoch": 0.21066666666666667,
736
+ "grad_norm": 0.033552590757608414,
737
+ "learning_rate": 1.4911111111111113e-05,
738
+ "loss": 0.0056,
739
+ "step": 7900
740
+ },
741
+ {
742
+ "epoch": 0.21333333333333335,
743
+ "grad_norm": 0.05111644044518471,
744
+ "learning_rate": 1.4888888888888888e-05,
745
+ "loss": 0.0061,
746
+ "step": 8000
747
+ },
748
+ {
749
+ "epoch": 0.216,
750
+ "grad_norm": 0.07947065681219101,
751
+ "learning_rate": 1.4866666666666668e-05,
752
+ "loss": 0.0078,
753
+ "step": 8100
754
+ },
755
+ {
756
+ "epoch": 0.21866666666666668,
757
+ "grad_norm": 0.04130572825670242,
758
+ "learning_rate": 1.4844444444444445e-05,
759
+ "loss": 0.0113,
760
+ "step": 8200
761
+ },
762
+ {
763
+ "epoch": 0.22133333333333333,
764
+ "grad_norm": 0.07327134907245636,
765
+ "learning_rate": 1.4822222222222221e-05,
766
+ "loss": 0.0128,
767
+ "step": 8300
768
+ },
769
+ {
770
+ "epoch": 0.224,
771
+ "grad_norm": 0.03684012219309807,
772
+ "learning_rate": 1.48e-05,
773
+ "loss": 0.0053,
774
+ "step": 8400
775
+ },
776
+ {
777
+ "epoch": 0.22666666666666666,
778
+ "grad_norm": 0.03126613050699234,
779
+ "learning_rate": 1.4777777777777778e-05,
780
+ "loss": 0.0079,
781
+ "step": 8500
782
+ },
783
+ {
784
+ "epoch": 0.22933333333333333,
785
+ "grad_norm": 0.06458591669797897,
786
+ "learning_rate": 1.4755555555555556e-05,
787
+ "loss": 0.016,
788
+ "step": 8600
789
+ },
790
+ {
791
+ "epoch": 0.232,
792
+ "grad_norm": 0.03598800674080849,
793
+ "learning_rate": 1.4733333333333333e-05,
794
+ "loss": 0.0043,
795
+ "step": 8700
796
+ },
797
+ {
798
+ "epoch": 0.23466666666666666,
799
+ "grad_norm": 0.032879263162612915,
800
+ "learning_rate": 1.4711111111111111e-05,
801
+ "loss": 0.01,
802
+ "step": 8800
803
+ },
804
+ {
805
+ "epoch": 0.23733333333333334,
806
+ "grad_norm": 0.04179929941892624,
807
+ "learning_rate": 1.4688888888888889e-05,
808
+ "loss": 0.0088,
809
+ "step": 8900
810
+ },
811
+ {
812
+ "epoch": 0.24,
813
+ "grad_norm": 0.025431055575609207,
814
+ "learning_rate": 1.4666666666666666e-05,
815
+ "loss": 0.0103,
816
+ "step": 9000
817
+ },
818
+ {
819
+ "epoch": 0.24266666666666667,
820
+ "grad_norm": 0.2131260633468628,
821
+ "learning_rate": 1.4644444444444446e-05,
822
+ "loss": 0.0144,
823
+ "step": 9100
824
+ },
825
+ {
826
+ "epoch": 0.24533333333333332,
827
+ "grad_norm": 0.03861390799283981,
828
+ "learning_rate": 1.4622222222222223e-05,
829
+ "loss": 0.0088,
830
+ "step": 9200
831
+ },
832
+ {
833
+ "epoch": 0.248,
834
+ "grad_norm": 0.026965836063027382,
835
+ "learning_rate": 1.46e-05,
836
+ "loss": 0.0102,
837
+ "step": 9300
838
+ },
839
+ {
840
+ "epoch": 0.25066666666666665,
841
+ "grad_norm": 0.02597722038626671,
842
+ "learning_rate": 1.4577777777777778e-05,
843
+ "loss": 0.0053,
844
+ "step": 9400
845
+ },
846
+ {
847
+ "epoch": 0.25333333333333335,
848
+ "grad_norm": 0.03126470744609833,
849
+ "learning_rate": 1.4555555555555556e-05,
850
+ "loss": 0.0077,
851
+ "step": 9500
852
+ },
853
+ {
854
+ "epoch": 0.256,
855
+ "grad_norm": 0.6931378841400146,
856
+ "learning_rate": 1.4533333333333334e-05,
857
+ "loss": 0.0077,
858
+ "step": 9600
859
+ },
860
+ {
861
+ "epoch": 0.25866666666666666,
862
+ "grad_norm": 0.026384815573692322,
863
+ "learning_rate": 1.4511111111111111e-05,
864
+ "loss": 0.0091,
865
+ "step": 9700
866
+ },
867
+ {
868
+ "epoch": 0.2613333333333333,
869
+ "grad_norm": 0.030717667192220688,
870
+ "learning_rate": 1.448888888888889e-05,
871
+ "loss": 0.0147,
872
+ "step": 9800
873
+ },
874
+ {
875
+ "epoch": 0.264,
876
+ "grad_norm": 0.02856365405023098,
877
+ "learning_rate": 1.4466666666666667e-05,
878
+ "loss": 0.0146,
879
+ "step": 9900
880
+ },
881
+ {
882
+ "epoch": 0.26666666666666666,
883
+ "grad_norm": 0.07391875237226486,
884
+ "learning_rate": 1.4444444444444444e-05,
885
+ "loss": 0.0072,
886
+ "step": 10000
887
+ },
888
+ {
889
+ "epoch": 0.2693333333333333,
890
+ "grad_norm": 0.026893191039562225,
891
+ "learning_rate": 1.4422222222222223e-05,
892
+ "loss": 0.0071,
893
+ "step": 10100
894
+ },
895
+ {
896
+ "epoch": 0.272,
897
+ "grad_norm": 0.04391603171825409,
898
+ "learning_rate": 1.44e-05,
899
+ "loss": 0.0124,
900
+ "step": 10200
901
+ },
902
+ {
903
+ "epoch": 0.27466666666666667,
904
+ "grad_norm": 0.03955280780792236,
905
+ "learning_rate": 1.4377777777777779e-05,
906
+ "loss": 0.0099,
907
+ "step": 10300
908
+ },
909
+ {
910
+ "epoch": 0.2773333333333333,
911
+ "grad_norm": 0.031186288222670555,
912
+ "learning_rate": 1.4355555555555556e-05,
913
+ "loss": 0.0099,
914
+ "step": 10400
915
+ },
916
+ {
917
+ "epoch": 0.28,
918
+ "grad_norm": 0.04440930485725403,
919
+ "learning_rate": 1.4333333333333334e-05,
920
+ "loss": 0.0042,
921
+ "step": 10500
922
+ },
923
+ {
924
+ "epoch": 0.2826666666666667,
925
+ "grad_norm": 0.035457074642181396,
926
+ "learning_rate": 1.4311111111111111e-05,
927
+ "loss": 0.0179,
928
+ "step": 10600
929
+ },
930
+ {
931
+ "epoch": 0.2853333333333333,
932
+ "grad_norm": 0.042004115879535675,
933
+ "learning_rate": 1.4288888888888889e-05,
934
+ "loss": 0.007,
935
+ "step": 10700
936
+ },
937
+ {
938
+ "epoch": 0.288,
939
+ "grad_norm": 0.0251617394387722,
940
+ "learning_rate": 1.4266666666666667e-05,
941
+ "loss": 0.0071,
942
+ "step": 10800
943
+ },
944
+ {
945
+ "epoch": 0.2906666666666667,
946
+ "grad_norm": 0.027697479352355003,
947
+ "learning_rate": 1.4244444444444444e-05,
948
+ "loss": 0.0046,
949
+ "step": 10900
950
+ },
951
+ {
952
+ "epoch": 0.29333333333333333,
953
+ "grad_norm": 0.035603832453489304,
954
+ "learning_rate": 1.4222222222222224e-05,
955
+ "loss": 0.0114,
956
+ "step": 11000
957
+ },
958
+ {
959
+ "epoch": 0.296,
960
+ "grad_norm": 0.2773999571800232,
961
+ "learning_rate": 1.42e-05,
962
+ "loss": 0.0195,
963
+ "step": 11100
964
+ },
965
+ {
966
+ "epoch": 0.2986666666666667,
967
+ "grad_norm": 0.041750043630599976,
968
+ "learning_rate": 1.4177777777777779e-05,
969
+ "loss": 0.0095,
970
+ "step": 11200
971
+ },
972
+ {
973
+ "epoch": 0.30133333333333334,
974
+ "grad_norm": 0.03574356436729431,
975
+ "learning_rate": 1.4155555555555556e-05,
976
+ "loss": 0.0063,
977
+ "step": 11300
978
+ },
979
+ {
980
+ "epoch": 0.304,
981
+ "grad_norm": 0.024210890755057335,
982
+ "learning_rate": 1.4133333333333332e-05,
983
+ "loss": 0.0148,
984
+ "step": 11400
985
+ }
986
+ ],
987
+ "logging_steps": 100,
988
+ "max_steps": 75000,
989
+ "num_input_tokens_seen": 0,
990
+ "num_train_epochs": 2,
991
+ "save_steps": 100,
992
+ "stateful_callbacks": {
993
+ "TrainerControl": {
994
+ "args": {
995
+ "should_epoch_stop": false,
996
+ "should_evaluate": false,
997
+ "should_log": false,
998
+ "should_save": true,
999
+ "should_training_stop": false
1000
+ },
1001
+ "attributes": {}
1002
+ }
1003
+ },
1004
+ "total_flos": 4.208037458069029e+18,
1005
+ "train_batch_size": 4,
1006
+ "trial_name": null,
1007
+ "trial_params": null
1008
+ }
checkpoint-11400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40a4c02bff4c325a39b29bbad54a640a82b1bc361c29cc2656ac8d29cf43eaa
3
+ size 5432