Dorn4449 committed on
Commit
d94c00e
·
verified ·
1 Parent(s): be3755c

checkpoint step 7000

Browse files
Files changed (45) hide show
  1. checkpoint-6700/README.md +202 -0
  2. checkpoint-6700/adapter_config.json +31 -0
  3. checkpoint-6700/adapter_model.safetensors +3 -0
  4. checkpoint-6700/checkpoint-6000/adapter_config.json +31 -0
  5. checkpoint-6700/checkpoint-6000/adapter_model.safetensors +3 -0
  6. checkpoint-6700/checkpoint-6100/adapter_config.json +31 -0
  7. checkpoint-6700/checkpoint-6100/adapter_model.safetensors +3 -0
  8. checkpoint-6700/optimizer.pt +3 -0
  9. checkpoint-6700/rng_state.pth +3 -0
  10. checkpoint-6700/scheduler.pt +3 -0
  11. checkpoint-6700/special_tokens_map.json +24 -0
  12. checkpoint-6700/tokenizer.model +3 -0
  13. checkpoint-6700/tokenizer_config.json +44 -0
  14. checkpoint-6700/trainer_state.json +679 -0
  15. checkpoint-6700/training_args.bin +3 -0
  16. checkpoint-6800/README.md +202 -0
  17. checkpoint-6800/adapter_config.json +31 -0
  18. checkpoint-6800/adapter_model.safetensors +3 -0
  19. checkpoint-6800/checkpoint-6000/adapter_config.json +31 -0
  20. checkpoint-6800/checkpoint-6000/adapter_model.safetensors +3 -0
  21. checkpoint-6800/checkpoint-6100/adapter_config.json +31 -0
  22. checkpoint-6800/checkpoint-6100/adapter_model.safetensors +3 -0
  23. checkpoint-6800/optimizer.pt +3 -0
  24. checkpoint-6800/rng_state.pth +3 -0
  25. checkpoint-6800/scheduler.pt +3 -0
  26. checkpoint-6800/special_tokens_map.json +24 -0
  27. checkpoint-6800/tokenizer.model +3 -0
  28. checkpoint-6800/tokenizer_config.json +44 -0
  29. checkpoint-6800/trainer_state.json +686 -0
  30. checkpoint-6800/training_args.bin +3 -0
  31. checkpoint-6900/README.md +202 -0
  32. checkpoint-6900/adapter_config.json +31 -0
  33. checkpoint-6900/adapter_model.safetensors +3 -0
  34. checkpoint-6900/checkpoint-6000/adapter_config.json +31 -0
  35. checkpoint-6900/checkpoint-6000/adapter_model.safetensors +3 -0
  36. checkpoint-6900/checkpoint-6100/adapter_config.json +31 -0
  37. checkpoint-6900/checkpoint-6100/adapter_model.safetensors +3 -0
  38. checkpoint-6900/optimizer.pt +3 -0
  39. checkpoint-6900/rng_state.pth +3 -0
  40. checkpoint-6900/scheduler.pt +3 -0
  41. checkpoint-6900/special_tokens_map.json +24 -0
  42. checkpoint-6900/tokenizer.model +3 -0
  43. checkpoint-6900/tokenizer_config.json +44 -0
  44. checkpoint-6900/trainer_state.json +693 -0
  45. checkpoint-6900/training_args.bin +3 -0
checkpoint-6700/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Dorn4449/CyberSentinel-Mistral-7B-v3.8
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
checkpoint-6700/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-6700/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a31b1767dedc516354c90ae65c6652b4b33f3086a42e32da86b4aa9d997f3615
3
+ size 27297032
checkpoint-6700/checkpoint-6000/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-6700/checkpoint-6000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:286625e19ea92f6bcbc715bf5d40bac2ac85da8b9ecb31679d50c38cb4b4b694
3
+ size 27297032
checkpoint-6700/checkpoint-6100/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-6700/checkpoint-6100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0a6ee2942c85113263a2331006fa30db088217df193b0fca3de2eebe282399b
3
+ size 27297032
checkpoint-6700/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71c4e836dc5556b91ee2c11ccbe88fce50624eebd3fa0004f286582515d980b2
3
+ size 54744314
checkpoint-6700/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be66c1af761364c52f99e5a1734b6bc5a54ba1e66ffb256f82e96523353ef894
3
+ size 14244
checkpoint-6700/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc6199eef99b9032101aaf24b46c5cbd4bbd3bcbedab26fe5c2688d336199299
3
+ size 1064
checkpoint-6700/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-6700/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-6700/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n",
34
+ "clean_up_tokenization_spaces": false,
35
+ "eos_token": "</s>",
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
checkpoint-6700/trainer_state.json ADDED
@@ -0,0 +1,679 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.17866666666666667,
5
+ "eval_steps": 500,
6
+ "global_step": 6700,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "entropy": 1.084044404476881,
13
+ "epoch": 0.010666666666666666,
14
+ "grad_norm": 0.796875,
15
+ "learning_rate": 7.92e-07,
16
+ "loss": 1.3782466125488282,
17
+ "mean_token_accuracy": 0.7083857330679894,
18
+ "num_tokens": 986491.0,
19
+ "step": 100
20
+ },
21
+ {
22
+ "entropy": 1.1007767178118228,
23
+ "epoch": 0.021333333333333333,
24
+ "grad_norm": 0.3125,
25
+ "learning_rate": 1.592e-06,
26
+ "loss": 1.2902149963378906,
27
+ "mean_token_accuracy": 0.7171274860203266,
28
+ "num_tokens": 1952567.0,
29
+ "step": 200
30
+ },
31
+ {
32
+ "entropy": 1.0802835547924041,
33
+ "epoch": 0.032,
34
+ "grad_norm": 0.28515625,
35
+ "learning_rate": 2.392e-06,
36
+ "loss": 1.170371322631836,
37
+ "mean_token_accuracy": 0.7354862231016159,
38
+ "num_tokens": 2935479.0,
39
+ "step": 300
40
+ },
41
+ {
42
+ "entropy": 1.0573877203464508,
43
+ "epoch": 0.042666666666666665,
44
+ "grad_norm": 0.23046875,
45
+ "learning_rate": 3.192e-06,
46
+ "loss": 1.062843780517578,
47
+ "mean_token_accuracy": 0.7536254300177098,
48
+ "num_tokens": 3920344.0,
49
+ "step": 400
50
+ },
51
+ {
52
+ "entropy": 1.0149462178349495,
53
+ "epoch": 0.05333333333333334,
54
+ "grad_norm": 0.212890625,
55
+ "learning_rate": 3.992e-06,
56
+ "loss": 0.9590372467041015,
57
+ "mean_token_accuracy": 0.7738636130094528,
58
+ "num_tokens": 4899217.0,
59
+ "step": 500
60
+ },
61
+ {
62
+ "entropy": 0.8931056271493435,
63
+ "epoch": 0.064,
64
+ "grad_norm": 0.28125,
65
+ "learning_rate": 4.792e-06,
66
+ "loss": 0.7998863983154297,
67
+ "mean_token_accuracy": 0.8012332341074944,
68
+ "num_tokens": 5883391.0,
69
+ "step": 600
70
+ },
71
+ {
72
+ "entropy": 0.6136953190714121,
73
+ "epoch": 0.07466666666666667,
74
+ "grad_norm": 1.1015625,
75
+ "learning_rate": 5.592000000000001e-06,
76
+ "loss": 0.5131131362915039,
77
+ "mean_token_accuracy": 0.866335421949625,
78
+ "num_tokens": 6857125.0,
79
+ "step": 700
80
+ },
81
+ {
82
+ "entropy": 0.2726459547691047,
83
+ "epoch": 0.08533333333333333,
84
+ "grad_norm": 0.474609375,
85
+ "learning_rate": 6.392e-06,
86
+ "loss": 0.18439870834350586,
87
+ "mean_token_accuracy": 0.9579810079932213,
88
+ "num_tokens": 7831290.0,
89
+ "step": 800
90
+ },
91
+ {
92
+ "entropy": 0.11284050881862641,
93
+ "epoch": 0.096,
94
+ "grad_norm": 0.1875,
95
+ "learning_rate": 7.192e-06,
96
+ "loss": 0.05271556854248047,
97
+ "mean_token_accuracy": 0.9904264670610428,
98
+ "num_tokens": 8815417.0,
99
+ "step": 900
100
+ },
101
+ {
102
+ "entropy": 0.068436967888847,
103
+ "epoch": 0.10666666666666667,
104
+ "grad_norm": 0.125,
105
+ "learning_rate": 7.992e-06,
106
+ "loss": 0.02549468755722046,
107
+ "mean_token_accuracy": 0.9951217715442181,
108
+ "num_tokens": 9782033.0,
109
+ "step": 1000
110
+ },
111
+ {
112
+ "entropy": 0.052372096767649055,
113
+ "epoch": 0.11733333333333333,
114
+ "grad_norm": 0.1318359375,
115
+ "learning_rate": 8.792e-06,
116
+ "loss": 0.017259199619293213,
117
+ "mean_token_accuracy": 0.9962555834650993,
118
+ "num_tokens": 10759962.0,
119
+ "step": 1100
120
+ },
121
+ {
122
+ "entropy": 0.03862797610927373,
123
+ "epoch": 0.128,
124
+ "grad_norm": 0.0867946669459343,
125
+ "learning_rate": 9.591999999999999e-06,
126
+ "loss": 0.013324768543243408,
127
+ "mean_token_accuracy": 0.9968111206591129,
128
+ "num_tokens": 987986.0,
129
+ "step": 1200
130
+ },
131
+ {
132
+ "entropy": 0.032093781144358215,
133
+ "epoch": 0.13866666666666666,
134
+ "grad_norm": 0.037592533975839615,
135
+ "learning_rate": 1.0392e-05,
136
+ "loss": 0.013392001390457153,
137
+ "mean_token_accuracy": 0.9965760576725006,
138
+ "num_tokens": 1968127.0,
139
+ "step": 1300
140
+ },
141
+ {
142
+ "entropy": 0.031904329673852774,
143
+ "epoch": 0.14933333333333335,
144
+ "grad_norm": 0.10515860468149185,
145
+ "learning_rate": 1.1192e-05,
146
+ "loss": 0.016373103857040404,
147
+ "mean_token_accuracy": 0.9955054900050163,
148
+ "num_tokens": 2965677.0,
149
+ "step": 1400
150
+ },
151
+ {
152
+ "entropy": 0.0265167937008664,
153
+ "epoch": 0.16,
154
+ "grad_norm": 0.12689532339572906,
155
+ "learning_rate": 1.1992e-05,
156
+ "loss": 0.011554093360900878,
157
+ "mean_token_accuracy": 0.996508517563343,
158
+ "num_tokens": 3938144.0,
159
+ "step": 1500
160
+ },
161
+ {
162
+ "entropy": 0.024146563813555986,
163
+ "epoch": 0.17066666666666666,
164
+ "grad_norm": 0.03401608020067215,
165
+ "learning_rate": 1.2792e-05,
166
+ "loss": 0.010398292541503906,
167
+ "mean_token_accuracy": 0.9966818282008171,
168
+ "num_tokens": 4897019.0,
169
+ "step": 1600
170
+ },
171
+ {
172
+ "entropy": 0.025755486716516316,
173
+ "epoch": 0.18133333333333335,
174
+ "grad_norm": 0.05834396556019783,
175
+ "learning_rate": 1.3592000000000001e-05,
176
+ "loss": 0.012847075462341309,
177
+ "mean_token_accuracy": 0.9960671140253544,
178
+ "num_tokens": 5880125.0,
179
+ "step": 1700
180
+ },
181
+ {
182
+ "entropy": 0.023504739217460154,
183
+ "epoch": 0.192,
184
+ "grad_norm": 0.09117468446493149,
185
+ "learning_rate": 1.4392e-05,
186
+ "loss": 0.011869451999664306,
187
+ "mean_token_accuracy": 0.9964955732226372,
188
+ "num_tokens": 6867360.0,
189
+ "step": 1800
190
+ },
191
+ {
192
+ "entropy": 0.022944011739455164,
193
+ "epoch": 0.20266666666666666,
194
+ "grad_norm": 0.046192608773708344,
195
+ "learning_rate": 1.4978666666666668e-05,
196
+ "loss": 0.010620262622833252,
197
+ "mean_token_accuracy": 0.9964672869443894,
198
+ "num_tokens": 7840472.0,
199
+ "step": 1900
200
+ },
201
+ {
202
+ "entropy": 0.018894561287015676,
203
+ "epoch": 0.21333333333333335,
204
+ "grad_norm": 0.02761668898165226,
205
+ "learning_rate": 1.4889777777777778e-05,
206
+ "loss": 0.0067046540975570675,
207
+ "mean_token_accuracy": 0.9975774252414703,
208
+ "num_tokens": 8827474.0,
209
+ "step": 2000
210
+ },
211
+ {
212
+ "entropy": 0.020934174589347095,
213
+ "epoch": 0.224,
214
+ "grad_norm": 0.02931295707821846,
215
+ "learning_rate": 1.4800888888888889e-05,
216
+ "loss": 0.010140993595123292,
217
+ "mean_token_accuracy": 0.996714953482151,
218
+ "num_tokens": 9808475.0,
219
+ "step": 2100
220
+ },
221
+ {
222
+ "entropy": 0.020719884738791734,
223
+ "epoch": 0.23466666666666666,
224
+ "grad_norm": 0.02652113325893879,
225
+ "learning_rate": 1.4712e-05,
226
+ "loss": 0.010557392835617066,
227
+ "mean_token_accuracy": 0.9969246552884579,
228
+ "num_tokens": 10782725.0,
229
+ "step": 2200
230
+ },
231
+ {
232
+ "entropy": 0.020559412932489068,
233
+ "epoch": 0.24533333333333332,
234
+ "grad_norm": 0.032906673848629,
235
+ "learning_rate": 1.4623111111111113e-05,
236
+ "loss": 0.009789772033691406,
237
+ "mean_token_accuracy": 0.9966975942254066,
238
+ "num_tokens": 11769893.0,
239
+ "step": 2300
240
+ },
241
+ {
242
+ "entropy": 0.018388252432923764,
243
+ "epoch": 0.256,
244
+ "grad_norm": 0.08162333816289902,
245
+ "learning_rate": 1.4534222222222222e-05,
246
+ "loss": 0.007909480929374695,
247
+ "mean_token_accuracy": 0.9972486282885075,
248
+ "num_tokens": 12753597.0,
249
+ "step": 2400
250
+ },
251
+ {
252
+ "entropy": 0.02204789153067395,
253
+ "epoch": 0.26666666666666666,
254
+ "grad_norm": 0.023511990904808044,
255
+ "learning_rate": 1.4445333333333334e-05,
256
+ "loss": 0.011945382356643677,
257
+ "mean_token_accuracy": 0.9963182592391968,
258
+ "num_tokens": 13739044.0,
259
+ "step": 2500
260
+ },
261
+ {
262
+ "entropy": 0.02037038065260276,
263
+ "epoch": 0.2773333333333333,
264
+ "grad_norm": 0.02014540508389473,
265
+ "learning_rate": 1.4356444444444446e-05,
266
+ "loss": 0.010165022611618042,
267
+ "mean_token_accuracy": 0.9967658732831478,
268
+ "num_tokens": 14709501.0,
269
+ "step": 2600
270
+ },
271
+ {
272
+ "entropy": 0.01947357293218374,
273
+ "epoch": 0.288,
274
+ "grad_norm": 0.01829116977751255,
275
+ "learning_rate": 1.4267555555555555e-05,
276
+ "loss": 0.009100326895713806,
277
+ "mean_token_accuracy": 0.9969173397123814,
278
+ "num_tokens": 15693309.0,
279
+ "step": 2700
280
+ },
281
+ {
282
+ "entropy": 0.021565243480727078,
283
+ "epoch": 0.2986666666666667,
284
+ "grad_norm": 0.031732361763715744,
285
+ "learning_rate": 1.4178666666666667e-05,
286
+ "loss": 0.01107092022895813,
287
+ "mean_token_accuracy": 0.9963353677093982,
288
+ "num_tokens": 985520.0,
289
+ "step": 2800
290
+ },
291
+ {
292
+ "entropy": 0.02136099517461844,
293
+ "epoch": 0.30933333333333335,
294
+ "grad_norm": 0.03870174661278725,
295
+ "learning_rate": 1.4089777777777779e-05,
296
+ "loss": 0.011569523811340332,
297
+ "mean_token_accuracy": 0.9964252272248268,
298
+ "num_tokens": 1969572.0,
299
+ "step": 2900
300
+ },
301
+ {
302
+ "entropy": 0.019074252294376492,
303
+ "epoch": 0.32,
304
+ "grad_norm": 0.01954864338040352,
305
+ "learning_rate": 1.4000888888888888e-05,
306
+ "loss": 0.008727578520774841,
307
+ "mean_token_accuracy": 0.9970971086621284,
308
+ "num_tokens": 2945404.0,
309
+ "step": 3000
310
+ },
311
+ {
312
+ "entropy": 0.018130726229865102,
313
+ "epoch": 0.33066666666666666,
314
+ "grad_norm": 0.01979902759194374,
315
+ "learning_rate": 1.3912e-05,
316
+ "loss": 0.007898266315460206,
317
+ "mean_token_accuracy": 0.9971335357427598,
318
+ "num_tokens": 3925122.0,
319
+ "step": 3100
320
+ },
321
+ {
322
+ "entropy": 0.019400757937546587,
323
+ "epoch": 0.3413333333333333,
324
+ "grad_norm": 0.030273959040641785,
325
+ "learning_rate": 1.0980206234443522e-05,
326
+ "loss": 0.009979066749413809,
327
+ "mean_token_accuracy": 0.9968534293584526,
328
+ "num_tokens": 945781.0,
329
+ "step": 3200
330
+ },
331
+ {
332
+ "entropy": 0.021496493350714446,
333
+ "epoch": 0.352,
334
+ "grad_norm": 0.023218955844640732,
335
+ "learning_rate": 1.0802417921061989e-05,
336
+ "loss": 0.011733214855194091,
337
+ "mean_token_accuracy": 0.996385814100504,
338
+ "num_tokens": 1919451.0,
339
+ "step": 3300
340
+ },
341
+ {
342
+ "entropy": 0.01932728004641831,
343
+ "epoch": 0.3626666666666667,
344
+ "grad_norm": 0.014179096557199955,
345
+ "learning_rate": 1.0624629607680455e-05,
346
+ "loss": 0.00976854920387268,
347
+ "mean_token_accuracy": 0.9968989025056362,
348
+ "num_tokens": 2892220.0,
349
+ "step": 3400
350
+ },
351
+ {
352
+ "entropy": 0.02088767145993188,
353
+ "epoch": 0.37333333333333335,
354
+ "grad_norm": 0.032297272235155106,
355
+ "learning_rate": 1.0446841294298921e-05,
356
+ "loss": 0.011026575565338134,
357
+ "mean_token_accuracy": 0.9964779444038868,
358
+ "num_tokens": 3878184.0,
359
+ "step": 3500
360
+ },
361
+ {
362
+ "entropy": 0.018377634914308463,
363
+ "epoch": 0.384,
364
+ "grad_norm": 0.012006225995719433,
365
+ "learning_rate": 1.3467555555555556e-05,
366
+ "loss": 0.008617221661235975,
367
+ "mean_token_accuracy": 0.9970365500320559,
368
+ "num_tokens": 904843.0,
369
+ "step": 3600
370
+ },
371
+ {
372
+ "entropy": 0.016050403744447977,
373
+ "epoch": 0.39466666666666667,
374
+ "grad_norm": 0.011128585785627365,
375
+ "learning_rate": 1.3378666666666666e-05,
376
+ "loss": 0.006250782608985901,
377
+ "mean_token_accuracy": 0.9976212471723557,
378
+ "num_tokens": 1878860.0,
379
+ "step": 3700
380
+ },
381
+ {
382
+ "entropy": 0.01809800002258271,
383
+ "epoch": 0.4053333333333333,
384
+ "grad_norm": 0.01952100545167923,
385
+ "learning_rate": 1.3289777777777778e-05,
386
+ "loss": 0.008064679503440857,
387
+ "mean_token_accuracy": 0.9971583542227745,
388
+ "num_tokens": 2860744.0,
389
+ "step": 3800
390
+ },
391
+ {
392
+ "entropy": 0.019823117861524225,
393
+ "epoch": 0.416,
394
+ "grad_norm": 0.011735321022570133,
395
+ "learning_rate": 1.3200888888888889e-05,
396
+ "loss": 0.010386246442794799,
397
+ "mean_token_accuracy": 0.9966941741108895,
398
+ "num_tokens": 3831565.0,
399
+ "step": 3900
400
+ },
401
+ {
402
+ "entropy": 0.02049923066298889,
403
+ "epoch": 0.4266666666666667,
404
+ "grad_norm": 0.006284466944634914,
405
+ "learning_rate": 1.3112e-05,
406
+ "loss": 0.01045264061107192,
407
+ "mean_token_accuracy": 0.9964593903616418,
408
+ "num_tokens": 849872.0,
409
+ "step": 4000
410
+ },
411
+ {
412
+ "entropy": 0.017120514765847476,
413
+ "epoch": 0.43733333333333335,
414
+ "grad_norm": 0.010025433264672756,
415
+ "learning_rate": 1.3023111111111111e-05,
416
+ "loss": 0.007535084486007691,
417
+ "mean_token_accuracy": 0.9973040929436684,
418
+ "num_tokens": 1826365.0,
419
+ "step": 4100
420
+ },
421
+ {
422
+ "entropy": 0.01937343619065359,
423
+ "epoch": 0.448,
424
+ "grad_norm": 0.008356385864317417,
425
+ "learning_rate": 1.2934222222222222e-05,
426
+ "loss": 0.010190980434417725,
427
+ "mean_token_accuracy": 0.9967715987563133,
428
+ "num_tokens": 2802749.0,
429
+ "step": 4200
430
+ },
431
+ {
432
+ "entropy": 0.017232202125014737,
433
+ "epoch": 0.45866666666666667,
434
+ "grad_norm": 0.011389357037842274,
435
+ "learning_rate": 1.2845333333333334e-05,
436
+ "loss": 0.007943087816238403,
437
+ "mean_token_accuracy": 0.9973043432831764,
438
+ "num_tokens": 3802758.0,
439
+ "step": 4300
440
+ },
441
+ {
442
+ "entropy": 0.022822092252748984,
443
+ "epoch": 0.4693333333333333,
444
+ "grad_norm": 0.017312563955783844,
445
+ "learning_rate": 1.2756444444444444e-05,
446
+ "loss": 0.012838510819423346,
447
+ "mean_token_accuracy": 0.9959132893953795,
448
+ "num_tokens": 789736.0,
449
+ "step": 4400
450
+ },
451
+ {
452
+ "entropy": 0.0184966369275935,
453
+ "epoch": 0.48,
454
+ "grad_norm": 0.04721185564994812,
455
+ "learning_rate": 1.2667555555555557e-05,
456
+ "loss": 0.009318522214889526,
457
+ "mean_token_accuracy": 0.9968563948571681,
458
+ "num_tokens": 1767600.0,
459
+ "step": 4500
460
+ },
461
+ {
462
+ "entropy": 0.021548866296652706,
463
+ "epoch": 0.49066666666666664,
464
+ "grad_norm": 0.9198243021965027,
465
+ "learning_rate": 1.2578666666666667e-05,
466
+ "loss": 0.013902791738510133,
467
+ "mean_token_accuracy": 0.9959381237626076,
468
+ "num_tokens": 2754982.0,
469
+ "step": 4600
470
+ },
471
+ {
472
+ "entropy": 0.0234364516264759,
473
+ "epoch": 0.5013333333333333,
474
+ "grad_norm": 0.011652004905045033,
475
+ "learning_rate": 1.2489777777777779e-05,
476
+ "loss": 0.011085785627365112,
477
+ "mean_token_accuracy": 0.9966410009562969,
478
+ "num_tokens": 3730109.0,
479
+ "step": 4700
480
+ },
481
+ {
482
+ "entropy": 0.017517962178529856,
483
+ "epoch": 0.512,
484
+ "grad_norm": 0.008591280318796635,
485
+ "learning_rate": 1.240088888888889e-05,
486
+ "loss": 0.0068675024168831965,
487
+ "mean_token_accuracy": 0.9973819294533173,
488
+ "num_tokens": 753786.0,
489
+ "step": 4800
490
+ },
491
+ {
492
+ "entropy": 0.019476991441333667,
493
+ "epoch": 0.5226666666666666,
494
+ "grad_norm": 0.013633953407406807,
495
+ "learning_rate": 1.2312e-05,
496
+ "loss": 0.009302983283996582,
497
+ "mean_token_accuracy": 0.9968732745945453,
498
+ "num_tokens": 1732738.0,
499
+ "step": 4900
500
+ },
501
+ {
502
+ "entropy": 0.020917424112558366,
503
+ "epoch": 0.5333333333333333,
504
+ "grad_norm": 0.01434489618986845,
505
+ "learning_rate": 1.2223111111111112e-05,
506
+ "loss": 0.010165597200393678,
507
+ "mean_token_accuracy": 0.9965643344819546,
508
+ "num_tokens": 2702069.0,
509
+ "step": 5000
510
+ },
511
+ {
512
+ "entropy": 0.017627096675569193,
513
+ "epoch": 0.544,
514
+ "grad_norm": 0.010028124786913395,
515
+ "learning_rate": 1.2134222222222223e-05,
516
+ "loss": 0.008581981062889099,
517
+ "mean_token_accuracy": 0.9972231885790825,
518
+ "num_tokens": 3692514.0,
519
+ "step": 5100
520
+ },
521
+ {
522
+ "entropy": 0.018209505494293832,
523
+ "epoch": 0.5546666666666666,
524
+ "grad_norm": 0.010386968962848186,
525
+ "learning_rate": 1.2045333333333333e-05,
526
+ "loss": 0.00814423689971099,
527
+ "mean_token_accuracy": 0.9972281313023051,
528
+ "num_tokens": 739184.0,
529
+ "step": 5200
530
+ },
531
+ {
532
+ "entropy": 0.02268622429575771,
533
+ "epoch": 0.5653333333333334,
534
+ "grad_norm": 0.01215888187289238,
535
+ "learning_rate": 1.1956444444444445e-05,
536
+ "loss": 0.012614715099334716,
537
+ "mean_token_accuracy": 0.9960222035646439,
538
+ "num_tokens": 1710404.0,
539
+ "step": 5300
540
+ },
541
+ {
542
+ "entropy": 0.020103816259652376,
543
+ "epoch": 0.576,
544
+ "grad_norm": 0.014174265787005424,
545
+ "learning_rate": 1.1867555555555556e-05,
546
+ "loss": 0.010200117826461791,
547
+ "mean_token_accuracy": 0.9967270520329475,
548
+ "num_tokens": 2686777.0,
549
+ "step": 5400
550
+ },
551
+ {
552
+ "entropy": 0.020467385197989643,
553
+ "epoch": 0.5866666666666667,
554
+ "grad_norm": 0.014257642440497875,
555
+ "learning_rate": 1.1778666666666666e-05,
556
+ "loss": 0.010185201168060303,
557
+ "mean_token_accuracy": 0.9966062535345555,
558
+ "num_tokens": 3655445.0,
559
+ "step": 5500
560
+ },
561
+ {
562
+ "entropy": 0.020591240752474878,
563
+ "epoch": 0.5973333333333334,
564
+ "grad_norm": 0.011690130457282066,
565
+ "learning_rate": 1.1689777777777778e-05,
566
+ "loss": 0.010783259419427402,
567
+ "mean_token_accuracy": 0.9964483192433482,
568
+ "num_tokens": 686164.0,
569
+ "step": 5600
570
+ },
571
+ {
572
+ "entropy": 0.0205141630244907,
573
+ "epoch": 0.608,
574
+ "grad_norm": 0.006469405256211758,
575
+ "learning_rate": 1.160088888888889e-05,
576
+ "loss": 0.010492314100265503,
577
+ "mean_token_accuracy": 0.9966046234965324,
578
+ "num_tokens": 1662167.0,
579
+ "step": 5700
580
+ },
581
+ {
582
+ "entropy": 0.020147288321750237,
583
+ "epoch": 0.6186666666666667,
584
+ "grad_norm": 0.01614871807396412,
585
+ "learning_rate": 1.1512e-05,
586
+ "loss": 0.010045292377471924,
587
+ "mean_token_accuracy": 0.9966985350847244,
588
+ "num_tokens": 2643221.0,
589
+ "step": 5800
590
+ },
591
+ {
592
+ "entropy": 0.01912423676229082,
593
+ "epoch": 0.6293333333333333,
594
+ "grad_norm": 0.010019957087934017,
595
+ "learning_rate": 1.1423111111111111e-05,
596
+ "loss": 0.009680591821670533,
597
+ "mean_token_accuracy": 0.9968334528803825,
598
+ "num_tokens": 3611899.0,
599
+ "step": 5900
600
+ },
601
+ {
602
+ "epoch": 0.16,
603
+ "grad_norm": 1.419013500213623,
604
+ "learning_rate": 1.2e-05,
605
+ "loss": 0.0443,
606
+ "step": 6000
607
+ },
608
+ {
609
+ "epoch": 0.16266666666666665,
610
+ "grad_norm": 0.0630747601389885,
611
+ "learning_rate": 1.22e-05,
612
+ "loss": 0.0066,
613
+ "step": 6100
614
+ },
615
+ {
616
+ "epoch": 0.16533333333333333,
617
+ "grad_norm": 0.08701734989881516,
618
+ "learning_rate": 1.24e-05,
619
+ "loss": 0.0171,
620
+ "step": 6200
621
+ },
622
+ {
623
+ "epoch": 0.168,
624
+ "grad_norm": 0.7274932861328125,
625
+ "learning_rate": 1.26e-05,
626
+ "loss": 0.0065,
627
+ "step": 6300
628
+ },
629
+ {
630
+ "epoch": 0.17066666666666666,
631
+ "grad_norm": 0.07388290017843246,
632
+ "learning_rate": 1.2800000000000001e-05,
633
+ "loss": 0.0087,
634
+ "step": 6400
635
+ },
636
+ {
637
+ "epoch": 0.17333333333333334,
638
+ "grad_norm": 0.045733992010354996,
639
+ "learning_rate": 1.3000000000000001e-05,
640
+ "loss": 0.0114,
641
+ "step": 6500
642
+ },
643
+ {
644
+ "epoch": 0.176,
645
+ "grad_norm": 0.4374091625213623,
646
+ "learning_rate": 1.32e-05,
647
+ "loss": 0.0139,
648
+ "step": 6600
649
+ },
650
+ {
651
+ "epoch": 0.17866666666666667,
652
+ "grad_norm": 0.050165899097919464,
653
+ "learning_rate": 1.34e-05,
654
+ "loss": 0.0145,
655
+ "step": 6700
656
+ }
657
+ ],
658
+ "logging_steps": 100,
659
+ "max_steps": 75000,
660
+ "num_input_tokens_seen": 0,
661
+ "num_train_epochs": 2,
662
+ "save_steps": 100,
663
+ "stateful_callbacks": {
664
+ "TrainerControl": {
665
+ "args": {
666
+ "should_epoch_stop": false,
667
+ "should_evaluate": false,
668
+ "should_log": false,
669
+ "should_save": true,
670
+ "should_training_stop": false
671
+ },
672
+ "attributes": {}
673
+ }
674
+ },
675
+ "total_flos": 3.530244817910268e+18,
676
+ "train_batch_size": 4,
677
+ "trial_name": null,
678
+ "trial_params": null
679
+ }
checkpoint-6700/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40a4c02bff4c325a39b29bbad54a640a82b1bc361c29cc2656ac8d29cf43eaa
3
+ size 5432
checkpoint-6800/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Dorn4449/CyberSentinel-Mistral-7B-v3.8
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
checkpoint-6800/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-6800/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0507d10eae1f0346bd6339d599120583923b5ce79cf6cee34102ef3cee8c9d98
3
+ size 27297032
checkpoint-6800/checkpoint-6000/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-6800/checkpoint-6000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:286625e19ea92f6bcbc715bf5d40bac2ac85da8b9ecb31679d50c38cb4b4b694
3
+ size 27297032
checkpoint-6800/checkpoint-6100/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-6800/checkpoint-6100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0a6ee2942c85113263a2331006fa30db088217df193b0fca3de2eebe282399b
3
+ size 27297032
checkpoint-6800/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99af4a4481c6650369d5e07dbe438199eb0c51fb7785d01f896dc75efa81c1cb
3
+ size 54744314
checkpoint-6800/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ce03ec18080cb529d92960e443bce2c42c82d7d8e70eff7f20951893977d067
3
+ size 14244
checkpoint-6800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82cccbba262cf26956ffd28c659fa3c037c0ffdf3b06154be11735905ca57f70
3
+ size 1064
checkpoint-6800/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-6800/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-6800/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n",
34
+ "clean_up_tokenization_spaces": false,
35
+ "eos_token": "</s>",
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
checkpoint-6800/trainer_state.json ADDED
@@ -0,0 +1,686 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.18133333333333335,
5
+ "eval_steps": 500,
6
+ "global_step": 6800,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "entropy": 1.084044404476881,
13
+ "epoch": 0.010666666666666666,
14
+ "grad_norm": 0.796875,
15
+ "learning_rate": 7.92e-07,
16
+ "loss": 1.3782466125488282,
17
+ "mean_token_accuracy": 0.7083857330679894,
18
+ "num_tokens": 986491.0,
19
+ "step": 100
20
+ },
21
+ {
22
+ "entropy": 1.1007767178118228,
23
+ "epoch": 0.021333333333333333,
24
+ "grad_norm": 0.3125,
25
+ "learning_rate": 1.592e-06,
26
+ "loss": 1.2902149963378906,
27
+ "mean_token_accuracy": 0.7171274860203266,
28
+ "num_tokens": 1952567.0,
29
+ "step": 200
30
+ },
31
+ {
32
+ "entropy": 1.0802835547924041,
33
+ "epoch": 0.032,
34
+ "grad_norm": 0.28515625,
35
+ "learning_rate": 2.392e-06,
36
+ "loss": 1.170371322631836,
37
+ "mean_token_accuracy": 0.7354862231016159,
38
+ "num_tokens": 2935479.0,
39
+ "step": 300
40
+ },
41
+ {
42
+ "entropy": 1.0573877203464508,
43
+ "epoch": 0.042666666666666665,
44
+ "grad_norm": 0.23046875,
45
+ "learning_rate": 3.192e-06,
46
+ "loss": 1.062843780517578,
47
+ "mean_token_accuracy": 0.7536254300177098,
48
+ "num_tokens": 3920344.0,
49
+ "step": 400
50
+ },
51
+ {
52
+ "entropy": 1.0149462178349495,
53
+ "epoch": 0.05333333333333334,
54
+ "grad_norm": 0.212890625,
55
+ "learning_rate": 3.992e-06,
56
+ "loss": 0.9590372467041015,
57
+ "mean_token_accuracy": 0.7738636130094528,
58
+ "num_tokens": 4899217.0,
59
+ "step": 500
60
+ },
61
+ {
62
+ "entropy": 0.8931056271493435,
63
+ "epoch": 0.064,
64
+ "grad_norm": 0.28125,
65
+ "learning_rate": 4.792e-06,
66
+ "loss": 0.7998863983154297,
67
+ "mean_token_accuracy": 0.8012332341074944,
68
+ "num_tokens": 5883391.0,
69
+ "step": 600
70
+ },
71
+ {
72
+ "entropy": 0.6136953190714121,
73
+ "epoch": 0.07466666666666667,
74
+ "grad_norm": 1.1015625,
75
+ "learning_rate": 5.592000000000001e-06,
76
+ "loss": 0.5131131362915039,
77
+ "mean_token_accuracy": 0.866335421949625,
78
+ "num_tokens": 6857125.0,
79
+ "step": 700
80
+ },
81
+ {
82
+ "entropy": 0.2726459547691047,
83
+ "epoch": 0.08533333333333333,
84
+ "grad_norm": 0.474609375,
85
+ "learning_rate": 6.392e-06,
86
+ "loss": 0.18439870834350586,
87
+ "mean_token_accuracy": 0.9579810079932213,
88
+ "num_tokens": 7831290.0,
89
+ "step": 800
90
+ },
91
+ {
92
+ "entropy": 0.11284050881862641,
93
+ "epoch": 0.096,
94
+ "grad_norm": 0.1875,
95
+ "learning_rate": 7.192e-06,
96
+ "loss": 0.05271556854248047,
97
+ "mean_token_accuracy": 0.9904264670610428,
98
+ "num_tokens": 8815417.0,
99
+ "step": 900
100
+ },
101
+ {
102
+ "entropy": 0.068436967888847,
103
+ "epoch": 0.10666666666666667,
104
+ "grad_norm": 0.125,
105
+ "learning_rate": 7.992e-06,
106
+ "loss": 0.02549468755722046,
107
+ "mean_token_accuracy": 0.9951217715442181,
108
+ "num_tokens": 9782033.0,
109
+ "step": 1000
110
+ },
111
+ {
112
+ "entropy": 0.052372096767649055,
113
+ "epoch": 0.11733333333333333,
114
+ "grad_norm": 0.1318359375,
115
+ "learning_rate": 8.792e-06,
116
+ "loss": 0.017259199619293213,
117
+ "mean_token_accuracy": 0.9962555834650993,
118
+ "num_tokens": 10759962.0,
119
+ "step": 1100
120
+ },
121
+ {
122
+ "entropy": 0.03862797610927373,
123
+ "epoch": 0.128,
124
+ "grad_norm": 0.0867946669459343,
125
+ "learning_rate": 9.591999999999999e-06,
126
+ "loss": 0.013324768543243408,
127
+ "mean_token_accuracy": 0.9968111206591129,
128
+ "num_tokens": 987986.0,
129
+ "step": 1200
130
+ },
131
+ {
132
+ "entropy": 0.032093781144358215,
133
+ "epoch": 0.13866666666666666,
134
+ "grad_norm": 0.037592533975839615,
135
+ "learning_rate": 1.0392e-05,
136
+ "loss": 0.013392001390457153,
137
+ "mean_token_accuracy": 0.9965760576725006,
138
+ "num_tokens": 1968127.0,
139
+ "step": 1300
140
+ },
141
+ {
142
+ "entropy": 0.031904329673852774,
143
+ "epoch": 0.14933333333333335,
144
+ "grad_norm": 0.10515860468149185,
145
+ "learning_rate": 1.1192e-05,
146
+ "loss": 0.016373103857040404,
147
+ "mean_token_accuracy": 0.9955054900050163,
148
+ "num_tokens": 2965677.0,
149
+ "step": 1400
150
+ },
151
+ {
152
+ "entropy": 0.0265167937008664,
153
+ "epoch": 0.16,
154
+ "grad_norm": 0.12689532339572906,
155
+ "learning_rate": 1.1992e-05,
156
+ "loss": 0.011554093360900878,
157
+ "mean_token_accuracy": 0.996508517563343,
158
+ "num_tokens": 3938144.0,
159
+ "step": 1500
160
+ },
161
+ {
162
+ "entropy": 0.024146563813555986,
163
+ "epoch": 0.17066666666666666,
164
+ "grad_norm": 0.03401608020067215,
165
+ "learning_rate": 1.2792e-05,
166
+ "loss": 0.010398292541503906,
167
+ "mean_token_accuracy": 0.9966818282008171,
168
+ "num_tokens": 4897019.0,
169
+ "step": 1600
170
+ },
171
+ {
172
+ "entropy": 0.025755486716516316,
173
+ "epoch": 0.18133333333333335,
174
+ "grad_norm": 0.05834396556019783,
175
+ "learning_rate": 1.3592000000000001e-05,
176
+ "loss": 0.012847075462341309,
177
+ "mean_token_accuracy": 0.9960671140253544,
178
+ "num_tokens": 5880125.0,
179
+ "step": 1700
180
+ },
181
+ {
182
+ "entropy": 0.023504739217460154,
183
+ "epoch": 0.192,
184
+ "grad_norm": 0.09117468446493149,
185
+ "learning_rate": 1.4392e-05,
186
+ "loss": 0.011869451999664306,
187
+ "mean_token_accuracy": 0.9964955732226372,
188
+ "num_tokens": 6867360.0,
189
+ "step": 1800
190
+ },
191
+ {
192
+ "entropy": 0.022944011739455164,
193
+ "epoch": 0.20266666666666666,
194
+ "grad_norm": 0.046192608773708344,
195
+ "learning_rate": 1.4978666666666668e-05,
196
+ "loss": 0.010620262622833252,
197
+ "mean_token_accuracy": 0.9964672869443894,
198
+ "num_tokens": 7840472.0,
199
+ "step": 1900
200
+ },
201
+ {
202
+ "entropy": 0.018894561287015676,
203
+ "epoch": 0.21333333333333335,
204
+ "grad_norm": 0.02761668898165226,
205
+ "learning_rate": 1.4889777777777778e-05,
206
+ "loss": 0.0067046540975570675,
207
+ "mean_token_accuracy": 0.9975774252414703,
208
+ "num_tokens": 8827474.0,
209
+ "step": 2000
210
+ },
211
+ {
212
+ "entropy": 0.020934174589347095,
213
+ "epoch": 0.224,
214
+ "grad_norm": 0.02931295707821846,
215
+ "learning_rate": 1.4800888888888889e-05,
216
+ "loss": 0.010140993595123292,
217
+ "mean_token_accuracy": 0.996714953482151,
218
+ "num_tokens": 9808475.0,
219
+ "step": 2100
220
+ },
221
+ {
222
+ "entropy": 0.020719884738791734,
223
+ "epoch": 0.23466666666666666,
224
+ "grad_norm": 0.02652113325893879,
225
+ "learning_rate": 1.4712e-05,
226
+ "loss": 0.010557392835617066,
227
+ "mean_token_accuracy": 0.9969246552884579,
228
+ "num_tokens": 10782725.0,
229
+ "step": 2200
230
+ },
231
+ {
232
+ "entropy": 0.020559412932489068,
233
+ "epoch": 0.24533333333333332,
234
+ "grad_norm": 0.032906673848629,
235
+ "learning_rate": 1.4623111111111113e-05,
236
+ "loss": 0.009789772033691406,
237
+ "mean_token_accuracy": 0.9966975942254066,
238
+ "num_tokens": 11769893.0,
239
+ "step": 2300
240
+ },
241
+ {
242
+ "entropy": 0.018388252432923764,
243
+ "epoch": 0.256,
244
+ "grad_norm": 0.08162333816289902,
245
+ "learning_rate": 1.4534222222222222e-05,
246
+ "loss": 0.007909480929374695,
247
+ "mean_token_accuracy": 0.9972486282885075,
248
+ "num_tokens": 12753597.0,
249
+ "step": 2400
250
+ },
251
+ {
252
+ "entropy": 0.02204789153067395,
253
+ "epoch": 0.26666666666666666,
254
+ "grad_norm": 0.023511990904808044,
255
+ "learning_rate": 1.4445333333333334e-05,
256
+ "loss": 0.011945382356643677,
257
+ "mean_token_accuracy": 0.9963182592391968,
258
+ "num_tokens": 13739044.0,
259
+ "step": 2500
260
+ },
261
+ {
262
+ "entropy": 0.02037038065260276,
263
+ "epoch": 0.2773333333333333,
264
+ "grad_norm": 0.02014540508389473,
265
+ "learning_rate": 1.4356444444444446e-05,
266
+ "loss": 0.010165022611618042,
267
+ "mean_token_accuracy": 0.9967658732831478,
268
+ "num_tokens": 14709501.0,
269
+ "step": 2600
270
+ },
271
+ {
272
+ "entropy": 0.01947357293218374,
273
+ "epoch": 0.288,
274
+ "grad_norm": 0.01829116977751255,
275
+ "learning_rate": 1.4267555555555555e-05,
276
+ "loss": 0.009100326895713806,
277
+ "mean_token_accuracy": 0.9969173397123814,
278
+ "num_tokens": 15693309.0,
279
+ "step": 2700
280
+ },
281
+ {
282
+ "entropy": 0.021565243480727078,
283
+ "epoch": 0.2986666666666667,
284
+ "grad_norm": 0.031732361763715744,
285
+ "learning_rate": 1.4178666666666667e-05,
286
+ "loss": 0.01107092022895813,
287
+ "mean_token_accuracy": 0.9963353677093982,
288
+ "num_tokens": 985520.0,
289
+ "step": 2800
290
+ },
291
+ {
292
+ "entropy": 0.02136099517461844,
293
+ "epoch": 0.30933333333333335,
294
+ "grad_norm": 0.03870174661278725,
295
+ "learning_rate": 1.4089777777777779e-05,
296
+ "loss": 0.011569523811340332,
297
+ "mean_token_accuracy": 0.9964252272248268,
298
+ "num_tokens": 1969572.0,
299
+ "step": 2900
300
+ },
301
+ {
302
+ "entropy": 0.019074252294376492,
303
+ "epoch": 0.32,
304
+ "grad_norm": 0.01954864338040352,
305
+ "learning_rate": 1.4000888888888888e-05,
306
+ "loss": 0.008727578520774841,
307
+ "mean_token_accuracy": 0.9970971086621284,
308
+ "num_tokens": 2945404.0,
309
+ "step": 3000
310
+ },
311
+ {
312
+ "entropy": 0.018130726229865102,
313
+ "epoch": 0.33066666666666666,
314
+ "grad_norm": 0.01979902759194374,
315
+ "learning_rate": 1.3912e-05,
316
+ "loss": 0.007898266315460206,
317
+ "mean_token_accuracy": 0.9971335357427598,
318
+ "num_tokens": 3925122.0,
319
+ "step": 3100
320
+ },
321
+ {
322
+ "entropy": 0.019400757937546587,
323
+ "epoch": 0.3413333333333333,
324
+ "grad_norm": 0.030273959040641785,
325
+ "learning_rate": 1.0980206234443522e-05,
326
+ "loss": 0.009979066749413809,
327
+ "mean_token_accuracy": 0.9968534293584526,
328
+ "num_tokens": 945781.0,
329
+ "step": 3200
330
+ },
331
+ {
332
+ "entropy": 0.021496493350714446,
333
+ "epoch": 0.352,
334
+ "grad_norm": 0.023218955844640732,
335
+ "learning_rate": 1.0802417921061989e-05,
336
+ "loss": 0.011733214855194091,
337
+ "mean_token_accuracy": 0.996385814100504,
338
+ "num_tokens": 1919451.0,
339
+ "step": 3300
340
+ },
341
+ {
342
+ "entropy": 0.01932728004641831,
343
+ "epoch": 0.3626666666666667,
344
+ "grad_norm": 0.014179096557199955,
345
+ "learning_rate": 1.0624629607680455e-05,
346
+ "loss": 0.00976854920387268,
347
+ "mean_token_accuracy": 0.9968989025056362,
348
+ "num_tokens": 2892220.0,
349
+ "step": 3400
350
+ },
351
+ {
352
+ "entropy": 0.02088767145993188,
353
+ "epoch": 0.37333333333333335,
354
+ "grad_norm": 0.032297272235155106,
355
+ "learning_rate": 1.0446841294298921e-05,
356
+ "loss": 0.011026575565338134,
357
+ "mean_token_accuracy": 0.9964779444038868,
358
+ "num_tokens": 3878184.0,
359
+ "step": 3500
360
+ },
361
+ {
362
+ "entropy": 0.018377634914308463,
363
+ "epoch": 0.384,
364
+ "grad_norm": 0.012006225995719433,
365
+ "learning_rate": 1.3467555555555556e-05,
366
+ "loss": 0.008617221661235975,
367
+ "mean_token_accuracy": 0.9970365500320559,
368
+ "num_tokens": 904843.0,
369
+ "step": 3600
370
+ },
371
+ {
372
+ "entropy": 0.016050403744447977,
373
+ "epoch": 0.39466666666666667,
374
+ "grad_norm": 0.011128585785627365,
375
+ "learning_rate": 1.3378666666666666e-05,
376
+ "loss": 0.006250782608985901,
377
+ "mean_token_accuracy": 0.9976212471723557,
378
+ "num_tokens": 1878860.0,
379
+ "step": 3700
380
+ },
381
+ {
382
+ "entropy": 0.01809800002258271,
383
+ "epoch": 0.4053333333333333,
384
+ "grad_norm": 0.01952100545167923,
385
+ "learning_rate": 1.3289777777777778e-05,
386
+ "loss": 0.008064679503440857,
387
+ "mean_token_accuracy": 0.9971583542227745,
388
+ "num_tokens": 2860744.0,
389
+ "step": 3800
390
+ },
391
+ {
392
+ "entropy": 0.019823117861524225,
393
+ "epoch": 0.416,
394
+ "grad_norm": 0.011735321022570133,
395
+ "learning_rate": 1.3200888888888889e-05,
396
+ "loss": 0.010386246442794799,
397
+ "mean_token_accuracy": 0.9966941741108895,
398
+ "num_tokens": 3831565.0,
399
+ "step": 3900
400
+ },
401
+ {
402
+ "entropy": 0.02049923066298889,
403
+ "epoch": 0.4266666666666667,
404
+ "grad_norm": 0.006284466944634914,
405
+ "learning_rate": 1.3112e-05,
406
+ "loss": 0.01045264061107192,
407
+ "mean_token_accuracy": 0.9964593903616418,
408
+ "num_tokens": 849872.0,
409
+ "step": 4000
410
+ },
411
+ {
412
+ "entropy": 0.017120514765847476,
413
+ "epoch": 0.43733333333333335,
414
+ "grad_norm": 0.010025433264672756,
415
+ "learning_rate": 1.3023111111111111e-05,
416
+ "loss": 0.007535084486007691,
417
+ "mean_token_accuracy": 0.9973040929436684,
418
+ "num_tokens": 1826365.0,
419
+ "step": 4100
420
+ },
421
+ {
422
+ "entropy": 0.01937343619065359,
423
+ "epoch": 0.448,
424
+ "grad_norm": 0.008356385864317417,
425
+ "learning_rate": 1.2934222222222222e-05,
426
+ "loss": 0.010190980434417725,
427
+ "mean_token_accuracy": 0.9967715987563133,
428
+ "num_tokens": 2802749.0,
429
+ "step": 4200
430
+ },
431
+ {
432
+ "entropy": 0.017232202125014737,
433
+ "epoch": 0.45866666666666667,
434
+ "grad_norm": 0.011389357037842274,
435
+ "learning_rate": 1.2845333333333334e-05,
436
+ "loss": 0.007943087816238403,
437
+ "mean_token_accuracy": 0.9973043432831764,
438
+ "num_tokens": 3802758.0,
439
+ "step": 4300
440
+ },
441
+ {
442
+ "entropy": 0.022822092252748984,
443
+ "epoch": 0.4693333333333333,
444
+ "grad_norm": 0.017312563955783844,
445
+ "learning_rate": 1.2756444444444444e-05,
446
+ "loss": 0.012838510819423346,
447
+ "mean_token_accuracy": 0.9959132893953795,
448
+ "num_tokens": 789736.0,
449
+ "step": 4400
450
+ },
451
+ {
452
+ "entropy": 0.0184966369275935,
453
+ "epoch": 0.48,
454
+ "grad_norm": 0.04721185564994812,
455
+ "learning_rate": 1.2667555555555557e-05,
456
+ "loss": 0.009318522214889526,
457
+ "mean_token_accuracy": 0.9968563948571681,
458
+ "num_tokens": 1767600.0,
459
+ "step": 4500
460
+ },
461
+ {
462
+ "entropy": 0.021548866296652706,
463
+ "epoch": 0.49066666666666664,
464
+ "grad_norm": 0.9198243021965027,
465
+ "learning_rate": 1.2578666666666667e-05,
466
+ "loss": 0.013902791738510133,
467
+ "mean_token_accuracy": 0.9959381237626076,
468
+ "num_tokens": 2754982.0,
469
+ "step": 4600
470
+ },
471
+ {
472
+ "entropy": 0.0234364516264759,
473
+ "epoch": 0.5013333333333333,
474
+ "grad_norm": 0.011652004905045033,
475
+ "learning_rate": 1.2489777777777779e-05,
476
+ "loss": 0.011085785627365112,
477
+ "mean_token_accuracy": 0.9966410009562969,
478
+ "num_tokens": 3730109.0,
479
+ "step": 4700
480
+ },
481
+ {
482
+ "entropy": 0.017517962178529856,
483
+ "epoch": 0.512,
484
+ "grad_norm": 0.008591280318796635,
485
+ "learning_rate": 1.240088888888889e-05,
486
+ "loss": 0.0068675024168831965,
487
+ "mean_token_accuracy": 0.9973819294533173,
488
+ "num_tokens": 753786.0,
489
+ "step": 4800
490
+ },
491
+ {
492
+ "entropy": 0.019476991441333667,
493
+ "epoch": 0.5226666666666666,
494
+ "grad_norm": 0.013633953407406807,
495
+ "learning_rate": 1.2312e-05,
496
+ "loss": 0.009302983283996582,
497
+ "mean_token_accuracy": 0.9968732745945453,
498
+ "num_tokens": 1732738.0,
499
+ "step": 4900
500
+ },
501
+ {
502
+ "entropy": 0.020917424112558366,
503
+ "epoch": 0.5333333333333333,
504
+ "grad_norm": 0.01434489618986845,
505
+ "learning_rate": 1.2223111111111112e-05,
506
+ "loss": 0.010165597200393678,
507
+ "mean_token_accuracy": 0.9965643344819546,
508
+ "num_tokens": 2702069.0,
509
+ "step": 5000
510
+ },
511
+ {
512
+ "entropy": 0.017627096675569193,
513
+ "epoch": 0.544,
514
+ "grad_norm": 0.010028124786913395,
515
+ "learning_rate": 1.2134222222222223e-05,
516
+ "loss": 0.008581981062889099,
517
+ "mean_token_accuracy": 0.9972231885790825,
518
+ "num_tokens": 3692514.0,
519
+ "step": 5100
520
+ },
521
+ {
522
+ "entropy": 0.018209505494293832,
523
+ "epoch": 0.5546666666666666,
524
+ "grad_norm": 0.010386968962848186,
525
+ "learning_rate": 1.2045333333333333e-05,
526
+ "loss": 0.00814423689971099,
527
+ "mean_token_accuracy": 0.9972281313023051,
528
+ "num_tokens": 739184.0,
529
+ "step": 5200
530
+ },
531
+ {
532
+ "entropy": 0.02268622429575771,
533
+ "epoch": 0.5653333333333334,
534
+ "grad_norm": 0.01215888187289238,
535
+ "learning_rate": 1.1956444444444445e-05,
536
+ "loss": 0.012614715099334716,
537
+ "mean_token_accuracy": 0.9960222035646439,
538
+ "num_tokens": 1710404.0,
539
+ "step": 5300
540
+ },
541
+ {
542
+ "entropy": 0.020103816259652376,
543
+ "epoch": 0.576,
544
+ "grad_norm": 0.014174265787005424,
545
+ "learning_rate": 1.1867555555555556e-05,
546
+ "loss": 0.010200117826461791,
547
+ "mean_token_accuracy": 0.9967270520329475,
548
+ "num_tokens": 2686777.0,
549
+ "step": 5400
550
+ },
551
+ {
552
+ "entropy": 0.020467385197989643,
553
+ "epoch": 0.5866666666666667,
554
+ "grad_norm": 0.014257642440497875,
555
+ "learning_rate": 1.1778666666666666e-05,
556
+ "loss": 0.010185201168060303,
557
+ "mean_token_accuracy": 0.9966062535345555,
558
+ "num_tokens": 3655445.0,
559
+ "step": 5500
560
+ },
561
+ {
562
+ "entropy": 0.020591240752474878,
563
+ "epoch": 0.5973333333333334,
564
+ "grad_norm": 0.011690130457282066,
565
+ "learning_rate": 1.1689777777777778e-05,
566
+ "loss": 0.010783259419427402,
567
+ "mean_token_accuracy": 0.9964483192433482,
568
+ "num_tokens": 686164.0,
569
+ "step": 5600
570
+ },
571
+ {
572
+ "entropy": 0.0205141630244907,
573
+ "epoch": 0.608,
574
+ "grad_norm": 0.006469405256211758,
575
+ "learning_rate": 1.160088888888889e-05,
576
+ "loss": 0.010492314100265503,
577
+ "mean_token_accuracy": 0.9966046234965324,
578
+ "num_tokens": 1662167.0,
579
+ "step": 5700
580
+ },
581
+ {
582
+ "entropy": 0.020147288321750237,
583
+ "epoch": 0.6186666666666667,
584
+ "grad_norm": 0.01614871807396412,
585
+ "learning_rate": 1.1512e-05,
586
+ "loss": 0.010045292377471924,
587
+ "mean_token_accuracy": 0.9966985350847244,
588
+ "num_tokens": 2643221.0,
589
+ "step": 5800
590
+ },
591
+ {
592
+ "entropy": 0.01912423676229082,
593
+ "epoch": 0.6293333333333333,
594
+ "grad_norm": 0.010019957087934017,
595
+ "learning_rate": 1.1423111111111111e-05,
596
+ "loss": 0.009680591821670533,
597
+ "mean_token_accuracy": 0.9968334528803825,
598
+ "num_tokens": 3611899.0,
599
+ "step": 5900
600
+ },
601
+ {
602
+ "epoch": 0.16,
603
+ "grad_norm": 1.419013500213623,
604
+ "learning_rate": 1.2e-05,
605
+ "loss": 0.0443,
606
+ "step": 6000
607
+ },
608
+ {
609
+ "epoch": 0.16266666666666665,
610
+ "grad_norm": 0.0630747601389885,
611
+ "learning_rate": 1.22e-05,
612
+ "loss": 0.0066,
613
+ "step": 6100
614
+ },
615
+ {
616
+ "epoch": 0.16533333333333333,
617
+ "grad_norm": 0.08701734989881516,
618
+ "learning_rate": 1.24e-05,
619
+ "loss": 0.0171,
620
+ "step": 6200
621
+ },
622
+ {
623
+ "epoch": 0.168,
624
+ "grad_norm": 0.7274932861328125,
625
+ "learning_rate": 1.26e-05,
626
+ "loss": 0.0065,
627
+ "step": 6300
628
+ },
629
+ {
630
+ "epoch": 0.17066666666666666,
631
+ "grad_norm": 0.07388290017843246,
632
+ "learning_rate": 1.2800000000000001e-05,
633
+ "loss": 0.0087,
634
+ "step": 6400
635
+ },
636
+ {
637
+ "epoch": 0.17333333333333334,
638
+ "grad_norm": 0.045733992010354996,
639
+ "learning_rate": 1.3000000000000001e-05,
640
+ "loss": 0.0114,
641
+ "step": 6500
642
+ },
643
+ {
644
+ "epoch": 0.176,
645
+ "grad_norm": 0.4374091625213623,
646
+ "learning_rate": 1.32e-05,
647
+ "loss": 0.0139,
648
+ "step": 6600
649
+ },
650
+ {
651
+ "epoch": 0.17866666666666667,
652
+ "grad_norm": 0.050165899097919464,
653
+ "learning_rate": 1.34e-05,
654
+ "loss": 0.0145,
655
+ "step": 6700
656
+ },
657
+ {
658
+ "epoch": 0.18133333333333335,
659
+ "grad_norm": 0.047875139862298965,
660
+ "learning_rate": 1.36e-05,
661
+ "loss": 0.0101,
662
+ "step": 6800
663
+ }
664
+ ],
665
+ "logging_steps": 100,
666
+ "max_steps": 75000,
667
+ "num_input_tokens_seen": 0,
668
+ "num_train_epochs": 2,
669
+ "save_steps": 100,
670
+ "stateful_callbacks": {
671
+ "TrainerControl": {
672
+ "args": {
673
+ "should_epoch_stop": false,
674
+ "should_evaluate": false,
675
+ "should_log": false,
676
+ "should_save": true,
677
+ "should_training_stop": false
678
+ },
679
+ "attributes": {}
680
+ }
681
+ },
682
+ "total_flos": 3.5446173650588467e+18,
683
+ "train_batch_size": 4,
684
+ "trial_name": null,
685
+ "trial_params": null
686
+ }
checkpoint-6800/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40a4c02bff4c325a39b29bbad54a640a82b1bc361c29cc2656ac8d29cf43eaa
3
+ size 5432
checkpoint-6900/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Dorn4449/CyberSentinel-Mistral-7B-v3.8
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
checkpoint-6900/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-6900/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fb16bd15c2d30833ede00f37a172efbf1cbd478e9aa01d936051c4064a692aa
3
+ size 27297032
checkpoint-6900/checkpoint-6000/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-6900/checkpoint-6000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:286625e19ea92f6bcbc715bf5d40bac2ac85da8b9ecb31679d50c38cb4b4b694
3
+ size 27297032
checkpoint-6900/checkpoint-6100/adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Dorn4449/CyberSentinel-Mistral-7B-v3.8",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "v_proj",
25
+ "k_proj",
26
+ "q_proj"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
checkpoint-6900/checkpoint-6100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0a6ee2942c85113263a2331006fa30db088217df193b0fca3de2eebe282399b
3
+ size 27297032
checkpoint-6900/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58edf8e29a66243e02f52a2373020e2d5895e90828764e844d326a9e3ca53dbd
3
+ size 54744314
checkpoint-6900/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6bd7d612ceb3e709a62706076d7ac3f7c5e669a74c0255776051d19f8d03b0c
3
+ size 14244
checkpoint-6900/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a98d5fe154c6c875e5ee23462b9940fd18916f76deaf1068c478a58799aa59cf
3
+ size 1064
checkpoint-6900/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-6900/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-6900/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n",
34
+ "clean_up_tokenization_spaces": false,
35
+ "eos_token": "</s>",
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
checkpoint-6900/trainer_state.json ADDED
@@ -0,0 +1,693 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.184,
5
+ "eval_steps": 500,
6
+ "global_step": 6900,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "entropy": 1.084044404476881,
13
+ "epoch": 0.010666666666666666,
14
+ "grad_norm": 0.796875,
15
+ "learning_rate": 7.92e-07,
16
+ "loss": 1.3782466125488282,
17
+ "mean_token_accuracy": 0.7083857330679894,
18
+ "num_tokens": 986491.0,
19
+ "step": 100
20
+ },
21
+ {
22
+ "entropy": 1.1007767178118228,
23
+ "epoch": 0.021333333333333333,
24
+ "grad_norm": 0.3125,
25
+ "learning_rate": 1.592e-06,
26
+ "loss": 1.2902149963378906,
27
+ "mean_token_accuracy": 0.7171274860203266,
28
+ "num_tokens": 1952567.0,
29
+ "step": 200
30
+ },
31
+ {
32
+ "entropy": 1.0802835547924041,
33
+ "epoch": 0.032,
34
+ "grad_norm": 0.28515625,
35
+ "learning_rate": 2.392e-06,
36
+ "loss": 1.170371322631836,
37
+ "mean_token_accuracy": 0.7354862231016159,
38
+ "num_tokens": 2935479.0,
39
+ "step": 300
40
+ },
41
+ {
42
+ "entropy": 1.0573877203464508,
43
+ "epoch": 0.042666666666666665,
44
+ "grad_norm": 0.23046875,
45
+ "learning_rate": 3.192e-06,
46
+ "loss": 1.062843780517578,
47
+ "mean_token_accuracy": 0.7536254300177098,
48
+ "num_tokens": 3920344.0,
49
+ "step": 400
50
+ },
51
+ {
52
+ "entropy": 1.0149462178349495,
53
+ "epoch": 0.05333333333333334,
54
+ "grad_norm": 0.212890625,
55
+ "learning_rate": 3.992e-06,
56
+ "loss": 0.9590372467041015,
57
+ "mean_token_accuracy": 0.7738636130094528,
58
+ "num_tokens": 4899217.0,
59
+ "step": 500
60
+ },
61
+ {
62
+ "entropy": 0.8931056271493435,
63
+ "epoch": 0.064,
64
+ "grad_norm": 0.28125,
65
+ "learning_rate": 4.792e-06,
66
+ "loss": 0.7998863983154297,
67
+ "mean_token_accuracy": 0.8012332341074944,
68
+ "num_tokens": 5883391.0,
69
+ "step": 600
70
+ },
71
+ {
72
+ "entropy": 0.6136953190714121,
73
+ "epoch": 0.07466666666666667,
74
+ "grad_norm": 1.1015625,
75
+ "learning_rate": 5.592000000000001e-06,
76
+ "loss": 0.5131131362915039,
77
+ "mean_token_accuracy": 0.866335421949625,
78
+ "num_tokens": 6857125.0,
79
+ "step": 700
80
+ },
81
+ {
82
+ "entropy": 0.2726459547691047,
83
+ "epoch": 0.08533333333333333,
84
+ "grad_norm": 0.474609375,
85
+ "learning_rate": 6.392e-06,
86
+ "loss": 0.18439870834350586,
87
+ "mean_token_accuracy": 0.9579810079932213,
88
+ "num_tokens": 7831290.0,
89
+ "step": 800
90
+ },
91
+ {
92
+ "entropy": 0.11284050881862641,
93
+ "epoch": 0.096,
94
+ "grad_norm": 0.1875,
95
+ "learning_rate": 7.192e-06,
96
+ "loss": 0.05271556854248047,
97
+ "mean_token_accuracy": 0.9904264670610428,
98
+ "num_tokens": 8815417.0,
99
+ "step": 900
100
+ },
101
+ {
102
+ "entropy": 0.068436967888847,
103
+ "epoch": 0.10666666666666667,
104
+ "grad_norm": 0.125,
105
+ "learning_rate": 7.992e-06,
106
+ "loss": 0.02549468755722046,
107
+ "mean_token_accuracy": 0.9951217715442181,
108
+ "num_tokens": 9782033.0,
109
+ "step": 1000
110
+ },
111
+ {
112
+ "entropy": 0.052372096767649055,
113
+ "epoch": 0.11733333333333333,
114
+ "grad_norm": 0.1318359375,
115
+ "learning_rate": 8.792e-06,
116
+ "loss": 0.017259199619293213,
117
+ "mean_token_accuracy": 0.9962555834650993,
118
+ "num_tokens": 10759962.0,
119
+ "step": 1100
120
+ },
121
+ {
122
+ "entropy": 0.03862797610927373,
123
+ "epoch": 0.128,
124
+ "grad_norm": 0.0867946669459343,
125
+ "learning_rate": 9.591999999999999e-06,
126
+ "loss": 0.013324768543243408,
127
+ "mean_token_accuracy": 0.9968111206591129,
128
+ "num_tokens": 987986.0,
129
+ "step": 1200
130
+ },
131
+ {
132
+ "entropy": 0.032093781144358215,
133
+ "epoch": 0.13866666666666666,
134
+ "grad_norm": 0.037592533975839615,
135
+ "learning_rate": 1.0392e-05,
136
+ "loss": 0.013392001390457153,
137
+ "mean_token_accuracy": 0.9965760576725006,
138
+ "num_tokens": 1968127.0,
139
+ "step": 1300
140
+ },
141
+ {
142
+ "entropy": 0.031904329673852774,
143
+ "epoch": 0.14933333333333335,
144
+ "grad_norm": 0.10515860468149185,
145
+ "learning_rate": 1.1192e-05,
146
+ "loss": 0.016373103857040404,
147
+ "mean_token_accuracy": 0.9955054900050163,
148
+ "num_tokens": 2965677.0,
149
+ "step": 1400
150
+ },
151
+ {
152
+ "entropy": 0.0265167937008664,
153
+ "epoch": 0.16,
154
+ "grad_norm": 0.12689532339572906,
155
+ "learning_rate": 1.1992e-05,
156
+ "loss": 0.011554093360900878,
157
+ "mean_token_accuracy": 0.996508517563343,
158
+ "num_tokens": 3938144.0,
159
+ "step": 1500
160
+ },
161
+ {
162
+ "entropy": 0.024146563813555986,
163
+ "epoch": 0.17066666666666666,
164
+ "grad_norm": 0.03401608020067215,
165
+ "learning_rate": 1.2792e-05,
166
+ "loss": 0.010398292541503906,
167
+ "mean_token_accuracy": 0.9966818282008171,
168
+ "num_tokens": 4897019.0,
169
+ "step": 1600
170
+ },
171
+ {
172
+ "entropy": 0.025755486716516316,
173
+ "epoch": 0.18133333333333335,
174
+ "grad_norm": 0.05834396556019783,
175
+ "learning_rate": 1.3592000000000001e-05,
176
+ "loss": 0.012847075462341309,
177
+ "mean_token_accuracy": 0.9960671140253544,
178
+ "num_tokens": 5880125.0,
179
+ "step": 1700
180
+ },
181
+ {
182
+ "entropy": 0.023504739217460154,
183
+ "epoch": 0.192,
184
+ "grad_norm": 0.09117468446493149,
185
+ "learning_rate": 1.4392e-05,
186
+ "loss": 0.011869451999664306,
187
+ "mean_token_accuracy": 0.9964955732226372,
188
+ "num_tokens": 6867360.0,
189
+ "step": 1800
190
+ },
191
+ {
192
+ "entropy": 0.022944011739455164,
193
+ "epoch": 0.20266666666666666,
194
+ "grad_norm": 0.046192608773708344,
195
+ "learning_rate": 1.4978666666666668e-05,
196
+ "loss": 0.010620262622833252,
197
+ "mean_token_accuracy": 0.9964672869443894,
198
+ "num_tokens": 7840472.0,
199
+ "step": 1900
200
+ },
201
+ {
202
+ "entropy": 0.018894561287015676,
203
+ "epoch": 0.21333333333333335,
204
+ "grad_norm": 0.02761668898165226,
205
+ "learning_rate": 1.4889777777777778e-05,
206
+ "loss": 0.0067046540975570675,
207
+ "mean_token_accuracy": 0.9975774252414703,
208
+ "num_tokens": 8827474.0,
209
+ "step": 2000
210
+ },
211
+ {
212
+ "entropy": 0.020934174589347095,
213
+ "epoch": 0.224,
214
+ "grad_norm": 0.02931295707821846,
215
+ "learning_rate": 1.4800888888888889e-05,
216
+ "loss": 0.010140993595123292,
217
+ "mean_token_accuracy": 0.996714953482151,
218
+ "num_tokens": 9808475.0,
219
+ "step": 2100
220
+ },
221
+ {
222
+ "entropy": 0.020719884738791734,
223
+ "epoch": 0.23466666666666666,
224
+ "grad_norm": 0.02652113325893879,
225
+ "learning_rate": 1.4712e-05,
226
+ "loss": 0.010557392835617066,
227
+ "mean_token_accuracy": 0.9969246552884579,
228
+ "num_tokens": 10782725.0,
229
+ "step": 2200
230
+ },
231
+ {
232
+ "entropy": 0.020559412932489068,
233
+ "epoch": 0.24533333333333332,
234
+ "grad_norm": 0.032906673848629,
235
+ "learning_rate": 1.4623111111111113e-05,
236
+ "loss": 0.009789772033691406,
237
+ "mean_token_accuracy": 0.9966975942254066,
238
+ "num_tokens": 11769893.0,
239
+ "step": 2300
240
+ },
241
+ {
242
+ "entropy": 0.018388252432923764,
243
+ "epoch": 0.256,
244
+ "grad_norm": 0.08162333816289902,
245
+ "learning_rate": 1.4534222222222222e-05,
246
+ "loss": 0.007909480929374695,
247
+ "mean_token_accuracy": 0.9972486282885075,
248
+ "num_tokens": 12753597.0,
249
+ "step": 2400
250
+ },
251
+ {
252
+ "entropy": 0.02204789153067395,
253
+ "epoch": 0.26666666666666666,
254
+ "grad_norm": 0.023511990904808044,
255
+ "learning_rate": 1.4445333333333334e-05,
256
+ "loss": 0.011945382356643677,
257
+ "mean_token_accuracy": 0.9963182592391968,
258
+ "num_tokens": 13739044.0,
259
+ "step": 2500
260
+ },
261
+ {
262
+ "entropy": 0.02037038065260276,
263
+ "epoch": 0.2773333333333333,
264
+ "grad_norm": 0.02014540508389473,
265
+ "learning_rate": 1.4356444444444446e-05,
266
+ "loss": 0.010165022611618042,
267
+ "mean_token_accuracy": 0.9967658732831478,
268
+ "num_tokens": 14709501.0,
269
+ "step": 2600
270
+ },
271
+ {
272
+ "entropy": 0.01947357293218374,
273
+ "epoch": 0.288,
274
+ "grad_norm": 0.01829116977751255,
275
+ "learning_rate": 1.4267555555555555e-05,
276
+ "loss": 0.009100326895713806,
277
+ "mean_token_accuracy": 0.9969173397123814,
278
+ "num_tokens": 15693309.0,
279
+ "step": 2700
280
+ },
281
+ {
282
+ "entropy": 0.021565243480727078,
283
+ "epoch": 0.2986666666666667,
284
+ "grad_norm": 0.031732361763715744,
285
+ "learning_rate": 1.4178666666666667e-05,
286
+ "loss": 0.01107092022895813,
287
+ "mean_token_accuracy": 0.9963353677093982,
288
+ "num_tokens": 985520.0,
289
+ "step": 2800
290
+ },
291
+ {
292
+ "entropy": 0.02136099517461844,
293
+ "epoch": 0.30933333333333335,
294
+ "grad_norm": 0.03870174661278725,
295
+ "learning_rate": 1.4089777777777779e-05,
296
+ "loss": 0.011569523811340332,
297
+ "mean_token_accuracy": 0.9964252272248268,
298
+ "num_tokens": 1969572.0,
299
+ "step": 2900
300
+ },
301
+ {
302
+ "entropy": 0.019074252294376492,
303
+ "epoch": 0.32,
304
+ "grad_norm": 0.01954864338040352,
305
+ "learning_rate": 1.4000888888888888e-05,
306
+ "loss": 0.008727578520774841,
307
+ "mean_token_accuracy": 0.9970971086621284,
308
+ "num_tokens": 2945404.0,
309
+ "step": 3000
310
+ },
311
+ {
312
+ "entropy": 0.018130726229865102,
313
+ "epoch": 0.33066666666666666,
314
+ "grad_norm": 0.01979902759194374,
315
+ "learning_rate": 1.3912e-05,
316
+ "loss": 0.007898266315460206,
317
+ "mean_token_accuracy": 0.9971335357427598,
318
+ "num_tokens": 3925122.0,
319
+ "step": 3100
320
+ },
321
+ {
322
+ "entropy": 0.019400757937546587,
323
+ "epoch": 0.3413333333333333,
324
+ "grad_norm": 0.030273959040641785,
325
+ "learning_rate": 1.0980206234443522e-05,
326
+ "loss": 0.009979066749413809,
327
+ "mean_token_accuracy": 0.9968534293584526,
328
+ "num_tokens": 945781.0,
329
+ "step": 3200
330
+ },
331
+ {
332
+ "entropy": 0.021496493350714446,
333
+ "epoch": 0.352,
334
+ "grad_norm": 0.023218955844640732,
335
+ "learning_rate": 1.0802417921061989e-05,
336
+ "loss": 0.011733214855194091,
337
+ "mean_token_accuracy": 0.996385814100504,
338
+ "num_tokens": 1919451.0,
339
+ "step": 3300
340
+ },
341
+ {
342
+ "entropy": 0.01932728004641831,
343
+ "epoch": 0.3626666666666667,
344
+ "grad_norm": 0.014179096557199955,
345
+ "learning_rate": 1.0624629607680455e-05,
346
+ "loss": 0.00976854920387268,
347
+ "mean_token_accuracy": 0.9968989025056362,
348
+ "num_tokens": 2892220.0,
349
+ "step": 3400
350
+ },
351
+ {
352
+ "entropy": 0.02088767145993188,
353
+ "epoch": 0.37333333333333335,
354
+ "grad_norm": 0.032297272235155106,
355
+ "learning_rate": 1.0446841294298921e-05,
356
+ "loss": 0.011026575565338134,
357
+ "mean_token_accuracy": 0.9964779444038868,
358
+ "num_tokens": 3878184.0,
359
+ "step": 3500
360
+ },
361
+ {
362
+ "entropy": 0.018377634914308463,
363
+ "epoch": 0.384,
364
+ "grad_norm": 0.012006225995719433,
365
+ "learning_rate": 1.3467555555555556e-05,
366
+ "loss": 0.008617221661235975,
367
+ "mean_token_accuracy": 0.9970365500320559,
368
+ "num_tokens": 904843.0,
369
+ "step": 3600
370
+ },
371
+ {
372
+ "entropy": 0.016050403744447977,
373
+ "epoch": 0.39466666666666667,
374
+ "grad_norm": 0.011128585785627365,
375
+ "learning_rate": 1.3378666666666666e-05,
376
+ "loss": 0.006250782608985901,
377
+ "mean_token_accuracy": 0.9976212471723557,
378
+ "num_tokens": 1878860.0,
379
+ "step": 3700
380
+ },
381
+ {
382
+ "entropy": 0.01809800002258271,
383
+ "epoch": 0.4053333333333333,
384
+ "grad_norm": 0.01952100545167923,
385
+ "learning_rate": 1.3289777777777778e-05,
386
+ "loss": 0.008064679503440857,
387
+ "mean_token_accuracy": 0.9971583542227745,
388
+ "num_tokens": 2860744.0,
389
+ "step": 3800
390
+ },
391
+ {
392
+ "entropy": 0.019823117861524225,
393
+ "epoch": 0.416,
394
+ "grad_norm": 0.011735321022570133,
395
+ "learning_rate": 1.3200888888888889e-05,
396
+ "loss": 0.010386246442794799,
397
+ "mean_token_accuracy": 0.9966941741108895,
398
+ "num_tokens": 3831565.0,
399
+ "step": 3900
400
+ },
401
+ {
402
+ "entropy": 0.02049923066298889,
403
+ "epoch": 0.4266666666666667,
404
+ "grad_norm": 0.006284466944634914,
405
+ "learning_rate": 1.3112e-05,
406
+ "loss": 0.01045264061107192,
407
+ "mean_token_accuracy": 0.9964593903616418,
408
+ "num_tokens": 849872.0,
409
+ "step": 4000
410
+ },
411
+ {
412
+ "entropy": 0.017120514765847476,
413
+ "epoch": 0.43733333333333335,
414
+ "grad_norm": 0.010025433264672756,
415
+ "learning_rate": 1.3023111111111111e-05,
416
+ "loss": 0.007535084486007691,
417
+ "mean_token_accuracy": 0.9973040929436684,
418
+ "num_tokens": 1826365.0,
419
+ "step": 4100
420
+ },
421
+ {
422
+ "entropy": 0.01937343619065359,
423
+ "epoch": 0.448,
424
+ "grad_norm": 0.008356385864317417,
425
+ "learning_rate": 1.2934222222222222e-05,
426
+ "loss": 0.010190980434417725,
427
+ "mean_token_accuracy": 0.9967715987563133,
428
+ "num_tokens": 2802749.0,
429
+ "step": 4200
430
+ },
431
+ {
432
+ "entropy": 0.017232202125014737,
433
+ "epoch": 0.45866666666666667,
434
+ "grad_norm": 0.011389357037842274,
435
+ "learning_rate": 1.2845333333333334e-05,
436
+ "loss": 0.007943087816238403,
437
+ "mean_token_accuracy": 0.9973043432831764,
438
+ "num_tokens": 3802758.0,
439
+ "step": 4300
440
+ },
441
+ {
442
+ "entropy": 0.022822092252748984,
443
+ "epoch": 0.4693333333333333,
444
+ "grad_norm": 0.017312563955783844,
445
+ "learning_rate": 1.2756444444444444e-05,
446
+ "loss": 0.012838510819423346,
447
+ "mean_token_accuracy": 0.9959132893953795,
448
+ "num_tokens": 789736.0,
449
+ "step": 4400
450
+ },
451
+ {
452
+ "entropy": 0.0184966369275935,
453
+ "epoch": 0.48,
454
+ "grad_norm": 0.04721185564994812,
455
+ "learning_rate": 1.2667555555555557e-05,
456
+ "loss": 0.009318522214889526,
457
+ "mean_token_accuracy": 0.9968563948571681,
458
+ "num_tokens": 1767600.0,
459
+ "step": 4500
460
+ },
461
+ {
462
+ "entropy": 0.021548866296652706,
463
+ "epoch": 0.49066666666666664,
464
+ "grad_norm": 0.9198243021965027,
465
+ "learning_rate": 1.2578666666666667e-05,
466
+ "loss": 0.013902791738510133,
467
+ "mean_token_accuracy": 0.9959381237626076,
468
+ "num_tokens": 2754982.0,
469
+ "step": 4600
470
+ },
471
+ {
472
+ "entropy": 0.0234364516264759,
473
+ "epoch": 0.5013333333333333,
474
+ "grad_norm": 0.011652004905045033,
475
+ "learning_rate": 1.2489777777777779e-05,
476
+ "loss": 0.011085785627365112,
477
+ "mean_token_accuracy": 0.9966410009562969,
478
+ "num_tokens": 3730109.0,
479
+ "step": 4700
480
+ },
481
+ {
482
+ "entropy": 0.017517962178529856,
483
+ "epoch": 0.512,
484
+ "grad_norm": 0.008591280318796635,
485
+ "learning_rate": 1.240088888888889e-05,
486
+ "loss": 0.0068675024168831965,
487
+ "mean_token_accuracy": 0.9973819294533173,
488
+ "num_tokens": 753786.0,
489
+ "step": 4800
490
+ },
491
+ {
492
+ "entropy": 0.019476991441333667,
493
+ "epoch": 0.5226666666666666,
494
+ "grad_norm": 0.013633953407406807,
495
+ "learning_rate": 1.2312e-05,
496
+ "loss": 0.009302983283996582,
497
+ "mean_token_accuracy": 0.9968732745945453,
498
+ "num_tokens": 1732738.0,
499
+ "step": 4900
500
+ },
501
+ {
502
+ "entropy": 0.020917424112558366,
503
+ "epoch": 0.5333333333333333,
504
+ "grad_norm": 0.01434489618986845,
505
+ "learning_rate": 1.2223111111111112e-05,
506
+ "loss": 0.010165597200393678,
507
+ "mean_token_accuracy": 0.9965643344819546,
508
+ "num_tokens": 2702069.0,
509
+ "step": 5000
510
+ },
511
+ {
512
+ "entropy": 0.017627096675569193,
513
+ "epoch": 0.544,
514
+ "grad_norm": 0.010028124786913395,
515
+ "learning_rate": 1.2134222222222223e-05,
516
+ "loss": 0.008581981062889099,
517
+ "mean_token_accuracy": 0.9972231885790825,
518
+ "num_tokens": 3692514.0,
519
+ "step": 5100
520
+ },
521
+ {
522
+ "entropy": 0.018209505494293832,
523
+ "epoch": 0.5546666666666666,
524
+ "grad_norm": 0.010386968962848186,
525
+ "learning_rate": 1.2045333333333333e-05,
526
+ "loss": 0.00814423689971099,
527
+ "mean_token_accuracy": 0.9972281313023051,
528
+ "num_tokens": 739184.0,
529
+ "step": 5200
530
+ },
531
+ {
532
+ "entropy": 0.02268622429575771,
533
+ "epoch": 0.5653333333333334,
534
+ "grad_norm": 0.01215888187289238,
535
+ "learning_rate": 1.1956444444444445e-05,
536
+ "loss": 0.012614715099334716,
537
+ "mean_token_accuracy": 0.9960222035646439,
538
+ "num_tokens": 1710404.0,
539
+ "step": 5300
540
+ },
541
+ {
542
+ "entropy": 0.020103816259652376,
543
+ "epoch": 0.576,
544
+ "grad_norm": 0.014174265787005424,
545
+ "learning_rate": 1.1867555555555556e-05,
546
+ "loss": 0.010200117826461791,
547
+ "mean_token_accuracy": 0.9967270520329475,
548
+ "num_tokens": 2686777.0,
549
+ "step": 5400
550
+ },
551
+ {
552
+ "entropy": 0.020467385197989643,
553
+ "epoch": 0.5866666666666667,
554
+ "grad_norm": 0.014257642440497875,
555
+ "learning_rate": 1.1778666666666666e-05,
556
+ "loss": 0.010185201168060303,
557
+ "mean_token_accuracy": 0.9966062535345555,
558
+ "num_tokens": 3655445.0,
559
+ "step": 5500
560
+ },
561
+ {
562
+ "entropy": 0.020591240752474878,
563
+ "epoch": 0.5973333333333334,
564
+ "grad_norm": 0.011690130457282066,
565
+ "learning_rate": 1.1689777777777778e-05,
566
+ "loss": 0.010783259419427402,
567
+ "mean_token_accuracy": 0.9964483192433482,
568
+ "num_tokens": 686164.0,
569
+ "step": 5600
570
+ },
571
+ {
572
+ "entropy": 0.0205141630244907,
573
+ "epoch": 0.608,
574
+ "grad_norm": 0.006469405256211758,
575
+ "learning_rate": 1.160088888888889e-05,
576
+ "loss": 0.010492314100265503,
577
+ "mean_token_accuracy": 0.9966046234965324,
578
+ "num_tokens": 1662167.0,
579
+ "step": 5700
580
+ },
581
+ {
582
+ "entropy": 0.020147288321750237,
583
+ "epoch": 0.6186666666666667,
584
+ "grad_norm": 0.01614871807396412,
585
+ "learning_rate": 1.1512e-05,
586
+ "loss": 0.010045292377471924,
587
+ "mean_token_accuracy": 0.9966985350847244,
588
+ "num_tokens": 2643221.0,
589
+ "step": 5800
590
+ },
591
+ {
592
+ "entropy": 0.01912423676229082,
593
+ "epoch": 0.6293333333333333,
594
+ "grad_norm": 0.010019957087934017,
595
+ "learning_rate": 1.1423111111111111e-05,
596
+ "loss": 0.009680591821670533,
597
+ "mean_token_accuracy": 0.9968334528803825,
598
+ "num_tokens": 3611899.0,
599
+ "step": 5900
600
+ },
601
+ {
602
+ "epoch": 0.16,
603
+ "grad_norm": 1.419013500213623,
604
+ "learning_rate": 1.2e-05,
605
+ "loss": 0.0443,
606
+ "step": 6000
607
+ },
608
+ {
609
+ "epoch": 0.16266666666666665,
610
+ "grad_norm": 0.0630747601389885,
611
+ "learning_rate": 1.22e-05,
612
+ "loss": 0.0066,
613
+ "step": 6100
614
+ },
615
+ {
616
+ "epoch": 0.16533333333333333,
617
+ "grad_norm": 0.08701734989881516,
618
+ "learning_rate": 1.24e-05,
619
+ "loss": 0.0171,
620
+ "step": 6200
621
+ },
622
+ {
623
+ "epoch": 0.168,
624
+ "grad_norm": 0.7274932861328125,
625
+ "learning_rate": 1.26e-05,
626
+ "loss": 0.0065,
627
+ "step": 6300
628
+ },
629
+ {
630
+ "epoch": 0.17066666666666666,
631
+ "grad_norm": 0.07388290017843246,
632
+ "learning_rate": 1.2800000000000001e-05,
633
+ "loss": 0.0087,
634
+ "step": 6400
635
+ },
636
+ {
637
+ "epoch": 0.17333333333333334,
638
+ "grad_norm": 0.045733992010354996,
639
+ "learning_rate": 1.3000000000000001e-05,
640
+ "loss": 0.0114,
641
+ "step": 6500
642
+ },
643
+ {
644
+ "epoch": 0.176,
645
+ "grad_norm": 0.4374091625213623,
646
+ "learning_rate": 1.32e-05,
647
+ "loss": 0.0139,
648
+ "step": 6600
649
+ },
650
+ {
651
+ "epoch": 0.17866666666666667,
652
+ "grad_norm": 0.050165899097919464,
653
+ "learning_rate": 1.34e-05,
654
+ "loss": 0.0145,
655
+ "step": 6700
656
+ },
657
+ {
658
+ "epoch": 0.18133333333333335,
659
+ "grad_norm": 0.047875139862298965,
660
+ "learning_rate": 1.36e-05,
661
+ "loss": 0.0101,
662
+ "step": 6800
663
+ },
664
+ {
665
+ "epoch": 0.184,
666
+ "grad_norm": 0.06959123909473419,
667
+ "learning_rate": 1.3800000000000002e-05,
668
+ "loss": 0.0126,
669
+ "step": 6900
670
+ }
671
+ ],
672
+ "logging_steps": 100,
673
+ "max_steps": 75000,
674
+ "num_input_tokens_seen": 0,
675
+ "num_train_epochs": 2,
676
+ "save_steps": 100,
677
+ "stateful_callbacks": {
678
+ "TrainerControl": {
679
+ "args": {
680
+ "should_epoch_stop": false,
681
+ "should_evaluate": false,
682
+ "should_log": false,
683
+ "should_save": true,
684
+ "should_training_stop": false
685
+ },
686
+ "attributes": {}
687
+ }
688
+ },
689
+ "total_flos": 3.559366092381192e+18,
690
+ "train_batch_size": 4,
691
+ "trial_name": null,
692
+ "trial_params": null
693
+ }
checkpoint-6900/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40a4c02bff4c325a39b29bbad54a640a82b1bc361c29cc2656ac8d29cf43eaa
3
+ size 5432