Ennon committed on
Commit
9ca9e8b
·
verified ·
1 Parent(s): 2751b76

Gemma 2 9B DevOps - Polish finetuned model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - pl
4
+ - en
5
+ license: mit
6
+ tags:
7
+ - devops
8
+ - kubernetes
9
+ - ansible
10
+ - terraform
11
+ - yaml
12
+ base_model: google/gemma-2-9b-it
13
+ ---
14
+
15
+ # Gemma-2-9B-PL-DevOps-Instruct-v2
16
+
17
+ Polish DevOps assistant fine-tuned on Infrastructure as Code tasks.
18
+
19
+ ## ⚠️ Fixes in v2
20
+ - **Fixed YAML indentation** - consistent 2-space indentation
21
+ - **High Quality Training** - Native BF16 training (no quantization errors)
22
+ - Trained WITHOUT Unsloth (no padding-free mode)
23
+ - `packing=False` to preserve whitespace
24
+
25
+ ## Evaluation / Inference
26
+ This model is saved in **BFLOAT16**.
27
+ - For 4-bit inference: Load with `load_in_4bit=True` (bitsandbytes)
28
+ - For vLLM: Compatible with standard loading or FP8/AWQ quantization
29
+
30
+ ## Training
31
+ | Param | Value |
32
+ |-------|-------|
33
+ | Base | google/gemma-2-9b-it |
34
+ | Method | Full BF16 Finetuning + LoRA |
35
+ | Batch | 96 effective |
36
+ | Train samples | 170,305 |
37
+ | Train loss | 0.6174 |
38
+ | Time | 667.0 min |
39
+ | GPU | H100 80GB |
chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '
2
+ ' + message['content'] | trim + '<end_of_turn>
3
+ ' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model
4
+ '}}{% endif %}
config.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Gemma2ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "attn_logit_softcapping": 50.0,
8
+ "bos_token_id": 2,
9
+ "cache_implementation": "hybrid",
10
+ "dtype": "bfloat16",
11
+ "eos_token_id": 1,
12
+ "final_logit_softcapping": 30.0,
13
+ "head_dim": 256,
14
+ "hidden_act": "gelu_pytorch_tanh",
15
+ "hidden_activation": "gelu_pytorch_tanh",
16
+ "hidden_size": 3584,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 14336,
19
+ "layer_types": [
20
+ "sliding_attention",
21
+ "full_attention",
22
+ "sliding_attention",
23
+ "full_attention",
24
+ "sliding_attention",
25
+ "full_attention",
26
+ "sliding_attention",
27
+ "full_attention",
28
+ "sliding_attention",
29
+ "full_attention",
30
+ "sliding_attention",
31
+ "full_attention",
32
+ "sliding_attention",
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "full_attention",
36
+ "sliding_attention",
37
+ "full_attention",
38
+ "sliding_attention",
39
+ "full_attention",
40
+ "sliding_attention",
41
+ "full_attention",
42
+ "sliding_attention",
43
+ "full_attention",
44
+ "sliding_attention",
45
+ "full_attention",
46
+ "sliding_attention",
47
+ "full_attention",
48
+ "sliding_attention",
49
+ "full_attention",
50
+ "sliding_attention",
51
+ "full_attention",
52
+ "sliding_attention",
53
+ "full_attention",
54
+ "sliding_attention",
55
+ "full_attention",
56
+ "sliding_attention",
57
+ "full_attention",
58
+ "sliding_attention",
59
+ "full_attention",
60
+ "sliding_attention",
61
+ "full_attention"
62
+ ],
63
+ "max_position_embeddings": 8192,
64
+ "model_type": "gemma2",
65
+ "num_attention_heads": 16,
66
+ "num_hidden_layers": 42,
67
+ "num_key_value_heads": 8,
68
+ "pad_token_id": 0,
69
+ "query_pre_attn_scalar": 256,
70
+ "rms_norm_eps": 1e-06,
71
+ "rope_parameters": {
72
+ "rope_theta": 10000.0,
73
+ "rope_type": "default"
74
+ },
75
+ "sliding_window": 4096,
76
+ "sliding_window_size": 4096,
77
+ "tie_word_embeddings": true,
78
+ "transformers_version": "5.0.0",
79
+ "use_bidirectional_attention": null,
80
+ "use_cache": false,
81
+ "vocab_size": 256000
82
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "cache_implementation": "hybrid",
5
+ "eos_token_id": [
6
+ 1
7
+ ],
8
+ "pad_token_id": 0,
9
+ "transformers_version": "5.0.0"
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e79c3ec83bb4df5182eb6057df9c8df02d95dbc5d9ccf69e8cfb81dfce671589
3
+ size 18483467000
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:394ace002a144ac6ad5486387502f2d36f70c087310c3d907857240c76fcb36e
3
+ size 34362748
tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<bos>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<eos>",
6
+ "extra_special_tokens": [
7
+ "<start_of_turn>",
8
+ "<end_of_turn>"
9
+ ],
10
+ "is_local": false,
11
+ "mask_token": "<mask>",
12
+ "model_max_length": 2048,
13
+ "pad_token": "<pad>",
14
+ "sp_model_kwargs": {},
15
+ "spaces_between_special_tokens": false,
16
+ "tokenizer_class": "GemmaTokenizer",
17
+ "unk_token": "<unk>",
18
+ "use_default_system_prompt": false
19
+ }
trainer_log_history.json ADDED
@@ -0,0 +1,908 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "loss": 1.9148719787597657,
4
+ "grad_norm": 0.984495222568512,
5
+ "learning_rate": 2.2641509433962265e-05,
6
+ "entropy": 0.9785909144083659,
7
+ "num_tokens": 909468.0,
8
+ "mean_token_accuracy": 0.6837772730986277,
9
+ "epoch": 0.014091122592766557,
10
+ "step": 25
11
+ },
12
+ {
13
+ "loss": 1.0547267150878907,
14
+ "grad_norm": 0.2687967121601105,
15
+ "learning_rate": 4.6226415094339625e-05,
16
+ "entropy": 1.0105916921297708,
17
+ "num_tokens": 1841827.0,
18
+ "mean_token_accuracy": 0.7993326298395793,
19
+ "epoch": 0.028182245185533115,
20
+ "step": 50
21
+ },
22
+ {
23
+ "loss": 0.909715805053711,
24
+ "grad_norm": 0.30847716331481934,
25
+ "learning_rate": 4.998165452627025e-05,
26
+ "entropy": 0.8024396904309591,
27
+ "num_tokens": 2783417.0,
28
+ "mean_token_accuracy": 0.8195314351717631,
29
+ "epoch": 0.042273367778299674,
30
+ "step": 75
31
+ },
32
+ {
33
+ "loss": 0.8271210479736328,
34
+ "grad_norm": 0.2949310839176178,
35
+ "learning_rate": 4.991201589453377e-05,
36
+ "entropy": 0.7632828823725383,
37
+ "num_tokens": 3703024.0,
38
+ "mean_token_accuracy": 0.8274987975756327,
39
+ "epoch": 0.05636449037106623,
40
+ "step": 100
41
+ },
42
+ {
43
+ "eval_loss": 0.7959097623825073,
44
+ "eval_runtime": 42.9182,
45
+ "eval_samples_per_second": 11.65,
46
+ "eval_steps_per_second": 0.746,
47
+ "eval_entropy": 0.7644990533590317,
48
+ "eval_num_tokens": 3703024.0,
49
+ "eval_mean_token_accuracy": 0.8274666927754879,
50
+ "epoch": 0.05636449037106623,
51
+ "step": 100
52
+ },
53
+ {
54
+ "loss": 0.7740676879882813,
55
+ "grad_norm": 0.29744288325309753,
56
+ "learning_rate": 4.97905632708703e-05,
57
+ "entropy": 0.7656655506292979,
58
+ "num_tokens": 4667355.0,
59
+ "mean_token_accuracy": 0.8285713505744934,
60
+ "epoch": 0.07045561296383279,
61
+ "step": 125
62
+ },
63
+ {
64
+ "loss": 0.7345146942138672,
65
+ "grad_norm": 0.3392024040222168,
66
+ "learning_rate": 4.9617549262105724e-05,
67
+ "entropy": 0.7322683656215667,
68
+ "num_tokens": 5580909.0,
69
+ "mean_token_accuracy": 0.8329473527272543,
70
+ "epoch": 0.08454673555659935,
71
+ "step": 150
72
+ },
73
+ {
74
+ "loss": 0.7085108184814453,
75
+ "grad_norm": 0.337108314037323,
76
+ "learning_rate": 4.939333371653541e-05,
77
+ "entropy": 0.7126858182748159,
78
+ "num_tokens": 6525675.0,
79
+ "mean_token_accuracy": 0.8382143716017405,
80
+ "epoch": 0.0986378581493659,
81
+ "step": 175
82
+ },
83
+ {
84
+ "loss": 0.7087242889404297,
85
+ "grad_norm": 0.39108389616012573,
86
+ "learning_rate": 4.911838297548306e-05,
87
+ "entropy": 0.7103402439753215,
88
+ "num_tokens": 7460420.0,
89
+ "mean_token_accuracy": 0.8385978392759958,
90
+ "epoch": 0.11272898074213246,
91
+ "step": 200
92
+ },
93
+ {
94
+ "eval_loss": 0.691197395324707,
95
+ "eval_runtime": 34.7324,
96
+ "eval_samples_per_second": 14.396,
97
+ "eval_steps_per_second": 0.921,
98
+ "eval_entropy": 0.7021831637248397,
99
+ "eval_num_tokens": 7460420.0,
100
+ "eval_mean_token_accuracy": 0.8397158589214087,
101
+ "epoch": 0.11272898074213246,
102
+ "step": 200
103
+ },
104
+ {
105
+ "loss": 0.6759407043457031,
106
+ "grad_norm": 0.41262030601501465,
107
+ "learning_rate": 4.8793268903366905e-05,
108
+ "entropy": 0.6836405583222707,
109
+ "num_tokens": 8367608.0,
110
+ "mean_token_accuracy": 0.8437444992860158,
111
+ "epoch": 0.12682010333489901,
112
+ "step": 225
113
+ },
114
+ {
115
+ "loss": 0.6790435028076172,
116
+ "grad_norm": 0.4088114798069,
117
+ "learning_rate": 4.8418667698290696e-05,
118
+ "entropy": 0.684131217400233,
119
+ "num_tokens": 9284184.0,
120
+ "mean_token_accuracy": 0.8435306719938914,
121
+ "epoch": 0.14091122592766558,
122
+ "step": 250
123
+ },
124
+ {
125
+ "loss": 0.6590489959716797,
126
+ "grad_norm": 0.4000810384750366,
127
+ "learning_rate": 4.7995358485633035e-05,
128
+ "entropy": 0.6666705779234569,
129
+ "num_tokens": 10228116.0,
130
+ "mean_token_accuracy": 0.8483462047576904,
131
+ "epoch": 0.15500234852043213,
132
+ "step": 275
133
+ },
134
+ {
135
+ "loss": 0.6522020721435546,
136
+ "grad_norm": 0.4356841742992401,
137
+ "learning_rate": 4.752422169756048e-05,
138
+ "entropy": 0.6561659761269887,
139
+ "num_tokens": 11141987.0,
140
+ "mean_token_accuracy": 0.8489574348926544,
141
+ "epoch": 0.1690934711131987,
142
+ "step": 300
143
+ },
144
+ {
145
+ "eval_loss": 0.6456555724143982,
146
+ "eval_runtime": 34.7464,
147
+ "eval_samples_per_second": 14.39,
148
+ "eval_steps_per_second": 0.921,
149
+ "eval_entropy": 0.6593516366556287,
150
+ "eval_num_tokens": 11141987.0,
151
+ "eval_mean_token_accuracy": 0.8488058932125568,
152
+ "epoch": 0.1690934711131987,
153
+ "step": 300
154
+ },
155
+ {
156
+ "loss": 0.6289921569824218,
157
+ "grad_norm": 0.44314464926719666,
158
+ "learning_rate": 4.700623724183468e-05,
159
+ "entropy": 0.6275538243850072,
160
+ "num_tokens": 12066391.0,
161
+ "mean_token_accuracy": 0.8543656957149506,
162
+ "epoch": 0.18318459370596524,
163
+ "step": 325
164
+ },
165
+ {
166
+ "loss": 0.6266510009765625,
167
+ "grad_norm": 0.4457905888557434,
168
+ "learning_rate": 4.644248246372233e-05,
169
+ "entropy": 0.6246062052249909,
170
+ "num_tokens": 13002518.0,
171
+ "mean_token_accuracy": 0.8540003776550293,
172
+ "epoch": 0.1972757162987318,
173
+ "step": 350
174
+ },
175
+ {
176
+ "loss": 0.6241617965698242,
177
+ "grad_norm": 0.45250752568244934,
178
+ "learning_rate": 4.5834129905246725e-05,
179
+ "entropy": 0.6225514455636343,
180
+ "num_tokens": 13915051.0,
181
+ "mean_token_accuracy": 0.8545078063011169,
182
+ "epoch": 0.21136683889149835,
183
+ "step": 375
184
+ },
185
+ {
186
+ "loss": 0.6225375366210938,
187
+ "grad_norm": 0.4502236545085907,
188
+ "learning_rate": 4.5182444866441694e-05,
189
+ "entropy": 0.6170252589384715,
190
+ "num_tokens": 14840689.0,
191
+ "mean_token_accuracy": 0.8547838560740153,
192
+ "epoch": 0.22545796148426492,
193
+ "step": 400
194
+ },
195
+ {
196
+ "eval_loss": 0.624234139919281,
197
+ "eval_runtime": 34.7767,
198
+ "eval_samples_per_second": 14.377,
199
+ "eval_steps_per_second": 0.92,
200
+ "eval_entropy": 0.6146921720355749,
201
+ "eval_num_tokens": 14840689.0,
202
+ "eval_mean_token_accuracy": 0.8521482553333044,
203
+ "epoch": 0.22545796148426492,
204
+ "step": 400
205
+ },
206
+ {
207
+ "loss": 0.6120803451538086,
208
+ "grad_norm": 0.40877044200897217,
209
+ "learning_rate": 4.4488782773679885e-05,
210
+ "entropy": 0.6126995925108591,
211
+ "num_tokens": 15781641.0,
212
+ "mean_token_accuracy": 0.8559599355856577,
213
+ "epoch": 0.23954908407703146,
214
+ "step": 425
215
+ },
216
+ {
217
+ "loss": 0.6238847732543945,
218
+ "grad_norm": 0.43226659297943115,
219
+ "learning_rate": 4.375458636054924e-05,
220
+ "entropy": 0.621622064312299,
221
+ "num_tokens": 16727723.0,
222
+ "mean_token_accuracy": 0.8534450817108155,
223
+ "epoch": 0.25364020666979803,
224
+ "step": 450
225
+ },
226
+ {
227
+ "loss": 0.5980339431762696,
228
+ "grad_norm": 0.4249129295349121,
229
+ "learning_rate": 4.298138266714094e-05,
230
+ "entropy": 0.5939697621266047,
231
+ "num_tokens": 17644465.0,
232
+ "mean_token_accuracy": 0.8589934686819712,
233
+ "epoch": 0.2677313292625646,
234
+ "step": 475
235
+ },
236
+ {
237
+ "loss": 0.6099626541137695,
238
+ "grad_norm": 0.4274967908859253,
239
+ "learning_rate": 4.2170779863989946e-05,
240
+ "entropy": 0.6078906120856603,
241
+ "num_tokens": 18563256.0,
242
+ "mean_token_accuracy": 0.8569075318177541,
243
+ "epoch": 0.28182245185533117,
244
+ "step": 500
245
+ },
246
+ {
247
+ "eval_loss": 0.6081598997116089,
248
+ "eval_runtime": 34.7944,
249
+ "eval_samples_per_second": 14.37,
250
+ "eval_steps_per_second": 0.92,
251
+ "eval_entropy": 0.6181821776553988,
252
+ "eval_num_tokens": 18563256.0,
253
+ "eval_mean_token_accuracy": 0.855755690485239,
254
+ "epoch": 0.28182245185533117,
255
+ "step": 500
256
+ },
257
+ {
258
+ "loss": 0.5912541961669922,
259
+ "grad_norm": 0.4574773907661438,
260
+ "learning_rate": 4.132446390727404e-05,
261
+ "entropy": 0.5872368462880453,
262
+ "num_tokens": 19469800.0,
263
+ "mean_token_accuracy": 0.8607503294944763,
264
+ "epoch": 0.2959135744480977,
265
+ "step": 525
266
+ },
267
+ {
268
+ "loss": 0.5932905197143554,
269
+ "grad_norm": 0.43131619691848755,
270
+ "learning_rate": 4.044419503222808e-05,
271
+ "entropy": 0.592293497522672,
272
+ "num_tokens": 20385241.0,
273
+ "mean_token_accuracy": 0.8604243552684784,
274
+ "epoch": 0.31000469704086425,
275
+ "step": 550
276
+ },
277
+ {
278
+ "loss": 0.5899901580810547,
279
+ "grad_norm": 0.4174489974975586,
280
+ "learning_rate": 3.953180409206677e-05,
281
+ "entropy": 0.5856852753957112,
282
+ "num_tokens": 21331318.0,
283
+ "mean_token_accuracy": 0.8611550823847453,
284
+ "epoch": 0.3240958196336308,
285
+ "step": 575
286
+ },
287
+ {
288
+ "loss": 0.600746955871582,
289
+ "grad_norm": 0.4607154428958893,
290
+ "learning_rate": 3.858918875003053e-05,
291
+ "entropy": 0.5992459511756897,
292
+ "num_tokens": 22288698.0,
293
+ "mean_token_accuracy": 0.8587616598606109,
294
+ "epoch": 0.3381869422263974,
295
+ "step": 600
296
+ },
297
+ {
298
+ "eval_loss": 0.5959565043449402,
299
+ "eval_runtime": 34.7871,
300
+ "eval_samples_per_second": 14.373,
301
+ "eval_steps_per_second": 0.92,
302
+ "eval_entropy": 0.6024799766018987,
303
+ "eval_num_tokens": 22288698.0,
304
+ "eval_mean_token_accuracy": 0.8582040295004845,
305
+ "epoch": 0.3381869422263974,
306
+ "step": 600
307
+ },
308
+ {
309
+ "loss": 0.5945447540283203,
310
+ "grad_norm": 0.4672609269618988,
311
+ "learning_rate": 3.761830953247457e-05,
312
+ "entropy": 0.5911998764673869,
313
+ "num_tokens": 23239625.0,
314
+ "mean_token_accuracy": 0.8611347631613413,
315
+ "epoch": 0.3522780648191639,
316
+ "step": 625
317
+ },
318
+ {
319
+ "loss": 0.5985645294189453,
320
+ "grad_norm": 0.4281597137451172,
321
+ "learning_rate": 3.662118575121024e-05,
322
+ "entropy": 0.5953911445538203,
323
+ "num_tokens": 24156885.0,
324
+ "mean_token_accuracy": 0.859769054253896,
325
+ "epoch": 0.3663691874119305,
326
+ "step": 650
327
+ },
328
+ {
329
+ "loss": 0.5921562957763672,
330
+ "grad_norm": 0.43555110692977905,
331
+ "learning_rate": 3.5599891303579746e-05,
332
+ "entropy": 0.5895072638988494,
333
+ "num_tokens": 25104558.0,
334
+ "mean_token_accuracy": 0.860583526690801,
335
+ "epoch": 0.38046031000469704,
336
+ "step": 675
337
+ },
338
+ {
339
+ "loss": 0.5891357421875,
340
+ "grad_norm": 0.46079888939857483,
341
+ "learning_rate": 3.455655035899951e-05,
342
+ "entropy": 0.5860749536752701,
343
+ "num_tokens": 26027947.0,
344
+ "mean_token_accuracy": 0.8607413911819458,
345
+ "epoch": 0.3945514325974636,
346
+ "step": 700
347
+ },
348
+ {
349
+ "eval_loss": 0.5854880213737488,
350
+ "eval_runtime": 34.8333,
351
+ "eval_samples_per_second": 14.354,
352
+ "eval_steps_per_second": 0.919,
353
+ "eval_entropy": 0.5855442956089973,
354
+ "eval_num_tokens": 26027947.0,
355
+ "eval_mean_token_accuracy": 0.8601280357688665,
356
+ "epoch": 0.3945514325974636,
357
+ "step": 700
358
+ },
359
+ {
360
+ "loss": 0.5821422576904297,
361
+ "grad_norm": 0.42215803265571594,
362
+ "learning_rate": 3.349333294094369e-05,
363
+ "entropy": 0.5828985869884491,
364
+ "num_tokens": 26996995.0,
365
+ "mean_token_accuracy": 0.8623941914240519,
366
+ "epoch": 0.40864255519023013,
367
+ "step": 725
368
+ },
369
+ {
370
+ "loss": 0.5734980392456055,
371
+ "grad_norm": 0.4118139147758484,
372
+ "learning_rate": 3.241245041355675e-05,
373
+ "entropy": 0.5695036280155182,
374
+ "num_tokens": 27948817.0,
375
+ "mean_token_accuracy": 0.8648126033941904,
376
+ "epoch": 0.4227336777829967,
377
+ "step": 750
378
+ },
379
+ {
380
+ "loss": 0.5755558776855468,
381
+ "grad_norm": 0.40968823432922363,
382
+ "learning_rate": 3.131615088228249e-05,
383
+ "entropy": 0.5767549270391464,
384
+ "num_tokens": 28893932.0,
385
+ "mean_token_accuracy": 0.8637475728988647,
386
+ "epoch": 0.43682480037576327,
387
+ "step": 775
388
+ },
389
+ {
390
+ "loss": 0.573729248046875,
391
+ "grad_norm": 0.4324798583984375,
392
+ "learning_rate": 3.0206714518075486e-05,
393
+ "entropy": 0.5696792916456859,
394
+ "num_tokens": 29833216.0,
395
+ "mean_token_accuracy": 0.8643758261203766,
396
+ "epoch": 0.45091592296852984,
397
+ "step": 800
398
+ },
399
+ {
400
+ "eval_loss": 0.5752155780792236,
401
+ "eval_runtime": 34.8957,
402
+ "eval_samples_per_second": 14.328,
403
+ "eval_steps_per_second": 0.917,
404
+ "eval_entropy": 0.5935880783945322,
405
+ "eval_num_tokens": 29833216.0,
406
+ "eval_mean_token_accuracy": 0.8622864987701178,
407
+ "epoch": 0.45091592296852984,
408
+ "step": 800
409
+ },
410
+ {
411
+ "loss": 0.5751077270507813,
412
+ "grad_norm": 0.4815407693386078,
413
+ "learning_rate": 2.9086448814920242e-05,
414
+ "entropy": 0.5717160554726919,
415
+ "num_tokens": 30736838.0,
416
+ "mean_token_accuracy": 0.864310040473938,
417
+ "epoch": 0.4650070455612964,
418
+ "step": 825
419
+ },
420
+ {
421
+ "loss": 0.566772575378418,
422
+ "grad_norm": 0.4774300158023834,
423
+ "learning_rate": 2.7957683790521676e-05,
424
+ "entropy": 0.5650917081038157,
425
+ "num_tokens": 31659300.0,
426
+ "mean_token_accuracy": 0.8658999156951904,
427
+ "epoch": 0.4790981681540629,
428
+ "step": 850
429
+ },
430
+ {
431
+ "loss": 0.5626054382324219,
432
+ "grad_norm": 0.42420145869255066,
433
+ "learning_rate": 2.6822767140148987e-05,
434
+ "entropy": 0.5590727700789769,
435
+ "num_tokens": 32593580.0,
436
+ "mean_token_accuracy": 0.8666303022702535,
437
+ "epoch": 0.4931892907468295,
438
+ "step": 875
439
+ },
440
+ {
441
+ "loss": 0.5539141082763672,
442
+ "grad_norm": 0.47889477014541626,
443
+ "learning_rate": 2.5684059353712307e-05,
444
+ "entropy": 0.5530497090021769,
445
+ "num_tokens": 33494838.0,
446
+ "mean_token_accuracy": 0.8674623111883799,
447
+ "epoch": 0.5072804133395961,
448
+ "step": 900
449
+ },
450
+ {
451
+ "eval_loss": 0.5693426728248596,
452
+ "eval_runtime": 34.8643,
453
+ "eval_samples_per_second": 14.341,
454
+ "eval_steps_per_second": 0.918,
455
+ "eval_entropy": 0.5721144182607532,
456
+ "eval_num_tokens": 33494838.0,
457
+ "eval_mean_token_accuracy": 0.8636170122772455,
458
+ "epoch": 0.5072804133395961,
459
+ "step": 900
460
+ },
461
+ {
462
+ "loss": 0.5634239959716797,
463
+ "grad_norm": 0.47955217957496643,
464
+ "learning_rate": 2.4543928806228074e-05,
465
+ "entropy": 0.562302614847819,
466
+ "num_tokens": 34443337.0,
467
+ "mean_token_accuracy": 0.8664345097541809,
468
+ "epoch": 0.5213715359323626,
469
+ "step": 925
470
+ },
471
+ {
472
+ "loss": 0.5764046096801758,
473
+ "grad_norm": 0.4992325007915497,
474
+ "learning_rate": 2.340474683188429e-05,
475
+ "entropy": 0.570437356432279,
476
+ "num_tokens": 35385705.0,
477
+ "mean_token_accuracy": 0.8647123599052429,
478
+ "epoch": 0.5354626585251292,
479
+ "step": 950
480
+ },
481
+ {
482
+ "loss": 0.5535079956054687,
483
+ "grad_norm": 0.5063010454177856,
484
+ "learning_rate": 2.2268882791951127e-05,
485
+ "entropy": 0.5491390575965246,
486
+ "num_tokens": 36300339.0,
487
+ "mean_token_accuracy": 0.8694652744134267,
488
+ "epoch": 0.5495537811178958,
489
+ "step": 975
490
+ },
491
+ {
492
+ "loss": 0.5499050521850586,
493
+ "grad_norm": 0.45809435844421387,
494
+ "learning_rate": 2.1138699146794867e-05,
495
+ "entropy": 0.5487177085876465,
496
+ "num_tokens": 37231011.0,
497
+ "mean_token_accuracy": 0.8694357828299204,
498
+ "epoch": 0.5636449037106623,
499
+ "step": 1000
500
+ },
501
+ {
502
+ "eval_loss": 0.5629469752311707,
503
+ "eval_runtime": 34.8051,
504
+ "eval_samples_per_second": 14.366,
505
+ "eval_steps_per_second": 0.919,
506
+ "eval_entropy": 0.5682820733636618,
507
+ "eval_num_tokens": 37231011.0,
508
+ "eval_mean_token_accuracy": 0.8649211004376411,
509
+ "epoch": 0.5636449037106623,
510
+ "step": 1000
511
+ },
512
+ {
513
+ "loss": 0.5626242446899414,
514
+ "grad_norm": 0.4490196108818054,
515
+ "learning_rate": 2.001654654224499e-05,
516
+ "entropy": 0.5606711500883103,
517
+ "num_tokens": 38163978.0,
518
+ "mean_token_accuracy": 0.8663252631823222,
519
+ "epoch": 0.5777360263034288,
520
+ "step": 1025
521
+ },
522
+ {
523
+ "loss": 0.5654045867919922,
524
+ "grad_norm": 0.4703851044178009,
525
+ "learning_rate": 1.8904758920533988e-05,
526
+ "entropy": 0.5644157862663269,
527
+ "num_tokens": 39100488.0,
528
+ "mean_token_accuracy": 0.8655120352904002,
529
+ "epoch": 0.5918271488961954,
530
+ "step": 1050
531
+ },
532
+ {
533
+ "loss": 0.5633118057250976,
534
+ "grad_norm": 0.507513165473938,
535
+ "learning_rate": 1.780564866597872e-05,
536
+ "entropy": 0.5595145153999329,
537
+ "num_tokens": 40015513.0,
538
+ "mean_token_accuracy": 0.8673883573214213,
539
+ "epoch": 0.6059182714889619,
540
+ "step": 1075
541
+ },
542
+ {
543
+ "loss": 0.5580905532836914,
544
+ "grad_norm": 0.48125702142715454,
545
+ "learning_rate": 1.67215017954996e-05,
546
+ "entropy": 0.5561687298615774,
547
+ "num_tokens": 40947677.0,
548
+ "mean_token_accuracy": 0.8681903723875681,
549
+ "epoch": 0.6200093940817285,
550
+ "step": 1100
551
+ },
552
+ {
553
+ "eval_loss": 0.5575993657112122,
554
+ "eval_runtime": 34.8521,
555
+ "eval_samples_per_second": 14.346,
556
+ "eval_steps_per_second": 0.918,
557
+ "eval_entropy": 0.5570412985980511,
558
+ "eval_num_tokens": 40947677.0,
559
+ "eval_mean_token_accuracy": 0.8659888282418251,
560
+ "epoch": 0.6200093940817285,
561
+ "step": 1100
562
+ },
563
+ {
564
+ "loss": 0.5523509979248047,
565
+ "grad_norm": 0.4786842167377472,
566
+ "learning_rate": 1.5654573203980784e-05,
567
+ "entropy": 0.5489772335688273,
568
+ "num_tokens": 41870358.0,
569
+ "mean_token_accuracy": 0.8689925694465637,
570
+ "epoch": 0.6341005166744951,
571
+ "step": 1125
572
+ },
573
+ {
574
+ "loss": 0.5597280883789062,
575
+ "grad_norm": 0.4649102985858917,
576
+ "learning_rate": 1.4607081974360465e-05,
577
+ "entropy": 0.5547034672896067,
578
+ "num_tokens": 42797849.0,
579
+ "mean_token_accuracy": 0.8681511521339417,
580
+ "epoch": 0.6481916392672616,
581
+ "step": 1150
582
+ },
583
+ {
584
+ "loss": 0.5571650695800782,
585
+ "grad_norm": 0.5057896971702576,
586
+ "learning_rate": 1.3581206762205706e-05,
587
+ "entropy": 0.5546683881680171,
588
+ "num_tokens": 43733188.0,
589
+ "mean_token_accuracy": 0.8681851788361867,
590
+ "epoch": 0.6622827618600282,
591
+ "step": 1175
592
+ },
593
+ {
594
+ "loss": 0.5405771255493164,
595
+ "grad_norm": 0.45025017857551575,
596
+ "learning_rate": 1.257908126437129e-05,
597
+ "entropy": 0.5370355778932572,
598
+ "num_tokens": 44647535.0,
599
+ "mean_token_accuracy": 0.8716498986879985,
600
+ "epoch": 0.6763738844527948,
601
+ "step": 1200
602
+ },
603
+ {
604
+ "eval_loss": 0.5538516640663147,
605
+ "eval_runtime": 34.8956,
606
+ "eval_samples_per_second": 14.328,
607
+ "eval_steps_per_second": 0.917,
608
+ "eval_entropy": 0.561651473864913,
609
+ "eval_num_tokens": 44647535.0,
610
+ "eval_mean_token_accuracy": 0.8666044622659683,
611
+ "epoch": 0.6763738844527948,
612
+ "step": 1200
613
+ },
614
+ {
615
+ "loss": 0.5540570831298828,
616
+ "grad_norm": 0.5011326670646667,
617
+ "learning_rate": 1.1602789781167347e-05,
618
+ "entropy": 0.5510254645347595,
619
+ "num_tokens": 45550031.0,
620
+ "mean_token_accuracy": 0.8685724465052287,
621
+ "epoch": 0.6904650070455612,
622
+ "step": 1225
623
+ },
624
+ {
625
+ "loss": 0.5520057296752929,
626
+ "grad_norm": 0.4687948226928711,
627
+ "learning_rate": 1.0654362881265754e-05,
628
+ "entropy": 0.549973030090332,
629
+ "num_tokens": 46479776.0,
630
+ "mean_token_accuracy": 0.8689675887425741,
631
+ "epoch": 0.7045561296383278,
632
+ "step": 1250
633
+ },
634
+ {
635
+ "loss": 0.5608898544311524,
636
+ "grad_norm": 0.5059524178504944,
637
+ "learning_rate": 9.735773178361964e-06,
638
+ "entropy": 0.5597832387685776,
639
+ "num_tokens": 47436308.0,
640
+ "mean_token_accuracy": 0.867308827638626,
641
+ "epoch": 0.7186472522310944,
642
+ "step": 1275
643
+ },
644
+ {
645
+ "loss": 0.5428831100463867,
646
+ "grad_norm": 0.5085554718971252,
647
+ "learning_rate": 8.848931228376136e-06,
648
+ "entropy": 0.5423163912693659,
649
+ "num_tokens": 48366020.0,
650
+ "mean_token_accuracy": 0.8694934193293253,
651
+ "epoch": 0.732738374823861,
652
+ "step": 1300
653
+ },
654
+ {
655
+ "eval_loss": 0.5501593947410583,
656
+ "eval_runtime": 34.8825,
657
+ "eval_samples_per_second": 14.334,
658
+ "eval_steps_per_second": 0.917,
659
+ "eval_entropy": 0.5527484444901347,
660
+ "eval_num_tokens": 48366020.0,
661
+ "eval_mean_token_accuracy": 0.8676421549171209,
662
+ "epoch": 0.732738374823861,
663
+ "step": 1300
664
+ },
665
+ {
666
+ "loss": 0.5539481353759765,
667
+ "grad_norm": 0.5734500288963318,
668
+ "learning_rate": 7.99568155572701e-06,
669
+ "entropy": 0.5485140432914098,
670
+ "num_tokens": 49280534.0,
671
+ "mean_token_accuracy": 0.8694952615102132,
672
+ "epoch": 0.7468294974166275,
673
+ "step": 1325
674
+ },
675
+ {
676
+ "loss": 0.5532180404663086,
677
+ "grad_norm": 0.4714227020740509,
678
+ "learning_rate": 7.177798816943287e-06,
679
+ "entropy": 0.5533179378509522,
680
+ "num_tokens": 50216029.0,
681
+ "mean_token_accuracy": 0.8688394419352213,
682
+ "epoch": 0.7609206200093941,
683
+ "step": 1350
684
+ },
685
+ {
686
+ "loss": 0.5513345336914063,
687
+ "grad_norm": 0.553312361240387,
688
+ "learning_rate": 6.3969841095918445e-06,
689
+ "entropy": 0.5489596172173818,
690
+ "num_tokens": 51157602.0,
691
+ "mean_token_accuracy": 0.8692836586634318,
692
+ "epoch": 0.7750117426021607,
693
+ "step": 1375
694
+ },
695
+ {
696
+ "loss": 0.5497291564941407,
697
+ "grad_norm": 0.4988526999950409,
698
+ "learning_rate": 5.654861434199757e-06,
699
+ "entropy": 0.5469332609574,
700
+ "num_tokens": 52117292.0,
701
+ "mean_token_accuracy": 0.8688764305909474,
702
+ "epoch": 0.7891028651949272,
703
+ "step": 1400
704
+ },
705
+ {
706
+ "eval_loss": 0.5481391549110413,
707
+ "eval_runtime": 34.9193,
708
+ "eval_samples_per_second": 14.319,
709
+ "eval_steps_per_second": 0.916,
710
+ "eval_entropy": 0.5496508749201894,
711
+ "eval_num_tokens": 52117292.0,
712
+ "eval_mean_token_accuracy": 0.8679818995296955,
713
+ "epoch": 0.7891028651949272,
714
+ "step": 1400
715
+ },
716
+ {
717
+ "loss": 0.541988639831543,
718
+ "grad_norm": 0.49807438254356384,
719
+ "learning_rate": 4.952974316528833e-06,
720
+ "entropy": 0.5369369254509608,
721
+ "num_tokens": 53073200.0,
722
+ "mean_token_accuracy": 0.8717625530560812,
723
+ "epoch": 0.8031939877876938,
724
+ "step": 1425
725
+ },
726
+ {
727
+ "loss": 0.5437938308715821,
728
+ "grad_norm": 0.5071395635604858,
729
+ "learning_rate": 4.292782597227962e-06,
730
+ "entropy": 0.5442028508583705,
731
+ "num_tokens": 54003611.0,
732
+ "mean_token_accuracy": 0.8705387047926585,
733
+ "epoch": 0.8172851103804603,
734
+ "step": 1450
735
+ },
736
+ {
737
+ "loss": 0.5272453689575195,
738
+ "grad_norm": 0.4754573404788971,
739
+ "learning_rate": 3.67565939554044e-06,
740
+ "entropy": 0.5260281827052434,
741
+ "num_tokens": 54934829.0,
742
+ "mean_token_accuracy": 0.873881352742513,
743
+ "epoch": 0.8313762329732268,
744
+ "step": 1475
745
+ },
746
+ {
747
+ "loss": 0.5533076095581054,
748
+ "grad_norm": 0.4666975140571594,
749
+ "learning_rate": 3.1028882533813643e-06,
750
+ "entropy": 0.5506138996283213,
751
+ "num_tokens": 55878834.0,
752
+ "mean_token_accuracy": 0.8685371776421865,
753
+ "epoch": 0.8454673555659934,
754
+ "step": 1500
755
+ },
756
+ {
757
+ "eval_loss": 0.5465222001075745,
758
+ "eval_runtime": 34.9157,
759
+ "eval_samples_per_second": 14.32,
760
+ "eval_steps_per_second": 0.916,
761
+ "eval_entropy": 0.5545311672613025,
762
+ "eval_num_tokens": 55878834.0,
763
+ "eval_mean_token_accuracy": 0.8680466562509537,
764
+ "epoch": 0.8454673555659934,
765
+ "step": 1500
766
+ },
767
+ {
768
+ "loss": 0.5680919265747071,
769
+ "grad_norm": 0.4815196990966797,
770
+ "learning_rate": 2.57566046572508e-06,
771
+ "entropy": 0.5671820533275604,
772
+ "num_tokens": 56792774.0,
773
+ "mean_token_accuracy": 0.8665999062856038,
774
+ "epoch": 0.85955847815876,
775
+ "step": 1525
776
+ },
777
+ {
778
+ "loss": 0.5526847839355469,
779
+ "grad_norm": 0.5315864086151123,
780
+ "learning_rate": 2.0950726028551306e-06,
781
+ "entropy": 0.5491122953097025,
782
+ "num_tokens": 57715569.0,
783
+ "mean_token_accuracy": 0.8698907673358918,
784
+ "epoch": 0.8736496007515265,
785
+ "step": 1550
786
+ },
787
+ {
788
+ "loss": 0.5509255599975585,
789
+ "grad_norm": 0.4776453375816345,
790
+ "learning_rate": 1.6621242296301964e-06,
791
+ "entropy": 0.5463390636444092,
792
+ "num_tokens": 58638742.0,
793
+ "mean_token_accuracy": 0.8696528116861979,
794
+ "epoch": 0.8877407233442931,
795
+ "step": 1575
796
+ },
797
+ {
798
+ "loss": 0.5376947021484375,
799
+ "grad_norm": 0.5089407563209534,
800
+ "learning_rate": 1.2777158265095901e-06,
801
+ "entropy": 0.5351915061473846,
802
+ "num_tokens": 59557570.0,
803
+ "mean_token_accuracy": 0.8726084315776825,
804
+ "epoch": 0.9018318459370597,
805
+ "step": 1600
806
+ },
807
+ {
808
+ "eval_loss": 0.5455822944641113,
809
+ "eval_runtime": 34.9436,
810
+ "eval_samples_per_second": 14.309,
811
+ "eval_steps_per_second": 0.916,
812
+ "eval_entropy": 0.5497067291289568,
813
+ "eval_num_tokens": 59557570.0,
814
+ "eval_mean_token_accuracy": 0.8682057596743107,
815
+ "epoch": 0.9018318459370597,
816
+ "step": 1600
817
+ },
818
+ {
819
+ "loss": 0.5281303787231445,
820
+ "grad_norm": 0.49538421630859375,
821
+ "learning_rate": 9.426469166623764e-07,
822
+ "entropy": 0.5247322716315588,
823
+ "num_tokens": 60483216.0,
824
+ "mean_token_accuracy": 0.874461769660314,
825
+ "epoch": 0.9159229685298262,
826
+ "step": 1625
827
+ },
828
+ {
829
+ "loss": 0.5454143524169922,
830
+ "grad_norm": 0.49807706475257874,
831
+ "learning_rate": 6.576144030555259e-07,
832
+ "entropy": 0.5443872211376826,
833
+ "num_tokens": 61433543.0,
834
+ "mean_token_accuracy": 0.8711319859822592,
835
+ "epoch": 0.9300140911225928,
836
+ "step": 1650
837
+ },
838
+ {
839
+ "loss": 0.5425717926025391,
840
+ "grad_norm": 0.4815911650657654,
841
+ "learning_rate": 4.2321111897965784e-07,
842
+ "entropy": 0.5405582892894745,
843
+ "num_tokens": 62383514.0,
844
+ "mean_token_accuracy": 0.8713100798924764,
845
+ "epoch": 0.9441052137153593,
846
+ "step": 1675
847
+ },
848
+ {
849
+ "loss": 0.5433485412597656,
850
+ "grad_norm": 0.7678675055503845,
851
+ "learning_rate": 2.399245950272466e-07,
852
+ "entropy": 0.5402486324310303,
853
+ "num_tokens": 63316188.0,
854
+ "mean_token_accuracy": 0.8717403117815653,
855
+ "epoch": 0.9581963363081258,
856
+ "step": 1700
857
+ },
858
+ {
859
+ "eval_loss": 0.5451184511184692,
860
+ "eval_runtime": 34.9697,
861
+ "eval_samples_per_second": 14.298,
862
+ "eval_steps_per_second": 0.915,
863
+ "eval_entropy": 0.5487884283065796,
864
+ "eval_num_tokens": 63316188.0,
865
+ "eval_mean_token_accuracy": 0.8685821667313576,
866
+ "epoch": 0.9581963363081258,
867
+ "step": 1700
868
+ },
869
+ {
870
+ "loss": 0.5411444473266601,
871
+ "grad_norm": 0.6061132550239563,
872
+ "learning_rate": 1.0813604508771169e-07,
873
+ "entropy": 0.5386804081996281,
874
+ "num_tokens": 64243151.0,
875
+ "mean_token_accuracy": 0.8711121753851573,
876
+ "epoch": 0.9722874589008924,
877
+ "step": 1725
878
+ },
879
+ {
880
+ "loss": 0.5380656433105468,
881
+ "grad_norm": 0.5095033645629883,
882
+ "learning_rate": 2.811957346845473e-08,
883
+ "entropy": 0.5319451389710108,
884
+ "num_tokens": 65152283.0,
885
+ "mean_token_accuracy": 0.8721891554196676,
886
+ "epoch": 0.986378581493659,
887
+ "step": 1750
888
+ },
889
+ {
890
+ "loss": 0.5395606231689453,
891
+ "grad_norm": 5.098243713378906,
892
+ "learning_rate": 4.160479090409286e-11,
893
+ "entropy": 0.5309042213291958,
894
+ "num_tokens": 66051107.0,
895
+ "mean_token_accuracy": 0.8716224238790314,
896
+ "epoch": 1.0,
897
+ "step": 1775
898
+ },
899
+ {
900
+ "train_runtime": 40018.5607,
901
+ "train_samples_per_second": 4.256,
902
+ "train_steps_per_second": 0.044,
903
+ "total_flos": 8.291891439265674e+18,
904
+ "train_loss": 0.6173567452229245,
905
+ "epoch": 1.0,
906
+ "step": 1775
907
+ }
908
+ ]
training_metadata.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "google/gemma-2-9b-it",
3
+ "display_name": "Gemma 2 9B (BF16, Batch16 MaxSafe)",
4
+ "timestamp": "2026-02-04T13:14:38.929340",
5
+ "training_config": {
6
+ "num_train_epochs": 1,
7
+ "per_device_train_batch_size": 16,
8
+ "gradient_accumulation_steps": 6,
9
+ "learning_rate": 5e-05,
10
+ "warmup_ratio": 0.03,
11
+ "lr_scheduler_type": "cosine",
12
+ "weight_decay": 0.01,
13
+ "max_seq_length": 2048,
14
+ "logging_steps": 25,
15
+ "eval_steps": 100,
16
+ "save_steps": 200,
17
+ "seed": 42,
18
+ "bf16": true,
19
+ "optim": "adamw_torch_fused",
20
+ "dataloader_num_workers": 8,
21
+ "torch_compile": false
22
+ },
23
+ "lora_config": {
24
+ "r": 16,
25
+ "lora_alpha": 32,
26
+ "lora_dropout": 0.05,
27
+ "target_modules": [
28
+ "q_proj",
29
+ "k_proj",
30
+ "v_proj",
31
+ "o_proj",
32
+ "gate_proj",
33
+ "up_proj",
34
+ "down_proj"
35
+ ],
36
+ "bias": "none",
37
+ "task_type": "CAUSAL_LM"
38
+ },
39
+ "train_loss": 0.6173567452229245,
40
+ "train_samples": 170305,
41
+ "val_samples": 8965,
42
+ "train_time_minutes": 666.9830995202065,
43
+ "max_memory_gb": 77.72561597824097,
44
+ "fix_applied": "YAML normalization via PyYAML (2 spaces), packing=False, Native BF16 Training, Batch 96"
45
+ }