ChristophSchuhmann committed on
Commit 288f8f3 · verified · 1 Parent(s): f04e4e5

Initial upload of checkpoint-4382
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 128000,
+ "dtype": "bfloat16",
+ "eos_token_id": 128009,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 3072,
+ "initializer_range": 0.02,
+ "intermediate_size": 8192,
+ "max_position_embeddings": 131072,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 8,
+ "pad_token_id": 128004,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 32.0,
+ "high_freq_factor": 4.0,
+ "low_freq_factor": 1.0,
+ "original_max_position_embeddings": 8192,
+ "rope_type": "llama3"
+ },
+ "rope_theta": 500000.0,
+ "tie_word_embeddings": true,
+ "transformers_version": "4.57.2",
+ "unsloth_fixed": true,
+ "unsloth_version": "2025.11.6",
+ "use_cache": true,
+ "vocab_size": 156940
+ }
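Note: this config describes a 28-layer, 3072-hidden Llama model with grouped-query attention (24 query heads over 8 KV heads) and llama3-style RoPE scaling from 8,192 to 131,072 positions. A minimal loading sketch in Python follows; the local checkpoint path is an assumption, and any transformers version with llama3 rope support should accept this config.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = "./checkpoint-4382"  # hypothetical local clone of this repo
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.bfloat16)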
generation_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 128000,
+ "do_sample": true,
+ "eos_token_id": 128009,
+ "max_length": 131072,
+ "pad_token_id": 128004,
+ "temperature": 0.6,
+ "top_p": 0.9,
+ "transformers_version": "4.57.2"
+ }
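Note: because do_sample is true, generate() picks up temperature 0.6 and nucleus top_p 0.9 from this file automatically. Continuing the loading sketch above (the prompt is illustrative):

inputs = tokenizer("Hello", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=64)  # temperature/top_p come from generation_config.json
print(tokenizer.decode(out[0], skip_special_tokens=True))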
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e99351714441c0edfa195b6102f1893b9c636dc957097e4f0ba391329c2a4d20
+ size 4991037968
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:87ae423c3725a5eacac72b1c186f28ed7e950a55b6271af29c960dba35ac4fc5
+ size 1610725592
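Note: these are Git LFS pointer files; the repository stores only the object id (a SHA-256 of the real file) and its byte size, while the ~5.0 GB and ~1.6 GB shards live in LFS storage. A sketch for checking a downloaded shard against its pointer (the filename assumes a local download):

import hashlib, os

def lfs_check(path, oid, size):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(1 << 20):  # hash in 1 MiB chunks
            h.update(chunk)
    return os.path.getsize(path) == size and h.hexdigest() == oid

print(lfs_check("model-00002-of-00002.safetensors",
                "87ae423c3725a5eacac72b1c186f28ed7e950a55b6271af29c960dba35ac4fc5",
                1610725592))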
model.safetensors.index.json ADDED
@@ -0,0 +1,262 @@
+ {
+ "metadata": {
+ "total_parameters": 3300867072,
+ "total_size": 6601734144
+ },
+ "weight_map": {
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.norm.weight": "model-00002-of-00002.safetensors"
+ }
+ }
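Note: the index maps every tensor name to the shard that holds it; embed_tokens and layers 0-19 sit in shard 1, layers 20-27 and the final norm in shard 2, and there is no lm_head entry because tie_word_embeddings is true. A sketch of how one tensor can be located and loaded without touching the other shard (paths assume a local download):

import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.norm.weight"
shard = index["weight_map"][name]  # -> "model-00002-of-00002.safetensors"
with safe_open(shard, framework="pt") as f:
    tensor = f.get_tensor(name)    # loads just this tensor from the shard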
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89a07ea6cbfbafc31f60c871625ae3fa7911fa336e59a361f3518e741182728d
+ size 13203690391
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e25b745caac240acf85f4d993ccdf1355dc409b319795bdaf284ce913f84fcba
+ size 16389
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:55c34b50cbc9ef0ee16f78568ab8cb9f39fda2528fb528565e4c4e6bbef4b5d1
+ size 16389
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e6fd420f2a650afc8853dcf4be7b848033a4553571318be351980db83cb07753
+ size 16389
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f24da66a8bbec7799b070f8ddce5f2e44fd2bdcde052433a0474aadf2b3e3028
+ size 16389
rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ce4ed0ec5b6b99af767428bd1c70e550c3a87cfcf6b471b0391c3c278894a784
+ size 16389
rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72eb96462bd64bca2c87e7ff1077864a9d49b25c89f2addef49d882400d6839b
+ size 16389
rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc83999425f72eeaf8c12a8957680422762a46701bcd05a47249a6fa69666239
+ size 16389
rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:77250f2f6a628fe10f48d8e92f5eac168e6eceb0a77a739ad018bcfa675df1a0
+ size 16389
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:369e40521235ea74dbfb0c4aba515ec8d7ece6b6b57c25be5b00416758fa43dd
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "pad_token": "<|finetune_right_pad_id|>"
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc3fecb199b4170636dbfab986d25f628157268d37b861f9cadaca60b1353bce
+ size 22849547
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
@@ -0,0 +1,3132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 626,
3
+ "best_metric": 4.402504920959473,
4
+ "best_model_checkpoint": "/home/deployer/laion/Orpheus-3B-Continued-2E-V4-WithGen/checkpoint-626",
5
+ "epoch": 0.12624531117654175,
6
+ "eval_steps": 313,
7
+ "global_step": 4382,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00020166982616060984,
14
+ "grad_norm": 0.2099609375,
15
+ "learning_rate": 1.8e-05,
16
+ "loss": 4.9769,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.0004033396523212197,
21
+ "grad_norm": 0.1552734375,
22
+ "learning_rate": 3.8e-05,
23
+ "loss": 4.8382,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.0006050094784818296,
28
+ "grad_norm": 0.07666015625,
29
+ "learning_rate": 5.8e-05,
30
+ "loss": 4.6796,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.0008066793046424393,
35
+ "grad_norm": 0.06787109375,
36
+ "learning_rate": 7.800000000000001e-05,
37
+ "loss": 4.6557,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.0010083491308030493,
42
+ "grad_norm": 0.058837890625,
43
+ "learning_rate": 9.8e-05,
44
+ "loss": 4.6592,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.0012100189569636591,
49
+ "grad_norm": 0.06982421875,
50
+ "learning_rate": 0.000118,
51
+ "loss": 4.612,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.001411688783124269,
56
+ "grad_norm": 0.06396484375,
57
+ "learning_rate": 0.000138,
58
+ "loss": 4.5519,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.0016133586092848787,
63
+ "grad_norm": 0.06787109375,
64
+ "learning_rate": 0.00015800000000000002,
65
+ "loss": 4.5649,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.0018150284354454887,
70
+ "grad_norm": 0.07177734375,
71
+ "learning_rate": 0.00017800000000000002,
72
+ "loss": 4.5665,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.0020166982616060987,
77
+ "grad_norm": 0.0615234375,
78
+ "learning_rate": 0.00019800000000000002,
79
+ "loss": 4.5854,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.0022183680877667085,
84
+ "grad_norm": 0.06298828125,
85
+ "learning_rate": 0.00019999998367737306,
86
+ "loss": 4.6197,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.0024200379139273183,
91
+ "grad_norm": 0.0693359375,
92
+ "learning_rate": 0.00019999992725348425,
93
+ "loss": 4.5047,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.002621707740087928,
98
+ "grad_norm": 0.0732421875,
99
+ "learning_rate": 0.00019999983052684242,
100
+ "loss": 4.5378,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.002823377566248538,
105
+ "grad_norm": 0.07763671875,
106
+ "learning_rate": 0.0001999996934974865,
107
+ "loss": 4.5702,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.0030250473924091476,
112
+ "grad_norm": 0.08154296875,
113
+ "learning_rate": 0.00019999951616547182,
114
+ "loss": 4.5161,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.0032267172185697574,
119
+ "grad_norm": 0.07177734375,
120
+ "learning_rate": 0.00019999929853086975,
121
+ "loss": 4.5427,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.0034283870447303676,
126
+ "grad_norm": 0.064453125,
127
+ "learning_rate": 0.00019999904059376803,
128
+ "loss": 4.5205,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.0036300568708909774,
133
+ "grad_norm": 0.06298828125,
134
+ "learning_rate": 0.00019999874235427067,
135
+ "loss": 4.5382,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.003831726697051587,
140
+ "grad_norm": 0.0693359375,
141
+ "learning_rate": 0.0001999984038124978,
142
+ "loss": 4.5279,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.004033396523212197,
147
+ "grad_norm": 0.06640625,
148
+ "learning_rate": 0.0001999980249685859,
149
+ "loss": 4.5047,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.004235066349372807,
154
+ "grad_norm": 0.06494140625,
155
+ "learning_rate": 0.00019999760582268763,
156
+ "loss": 4.5041,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.004436736175533417,
161
+ "grad_norm": 0.06689453125,
162
+ "learning_rate": 0.00019999714637497192,
163
+ "loss": 4.5513,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.004638406001694026,
168
+ "grad_norm": 0.0654296875,
169
+ "learning_rate": 0.00019999664662562398,
170
+ "loss": 4.5115,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.0048400758278546365,
175
+ "grad_norm": 0.06396484375,
176
+ "learning_rate": 0.0001999961065748452,
177
+ "loss": 4.5027,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.005041745654015246,
182
+ "grad_norm": 0.07373046875,
183
+ "learning_rate": 0.00019999552622285317,
184
+ "loss": 4.448,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.005243415480175856,
189
+ "grad_norm": 0.06689453125,
190
+ "learning_rate": 0.0001999949055698819,
191
+ "loss": 4.5337,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.005445085306336466,
196
+ "grad_norm": 0.0654296875,
197
+ "learning_rate": 0.00019999424461618145,
198
+ "loss": 4.5358,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.005646755132497076,
203
+ "grad_norm": 0.06103515625,
204
+ "learning_rate": 0.00019999354336201828,
205
+ "loss": 4.5168,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.005848424958657686,
210
+ "grad_norm": 0.0634765625,
211
+ "learning_rate": 0.0001999928018076749,
212
+ "loss": 4.4455,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.006050094784818295,
217
+ "grad_norm": 0.06591796875,
218
+ "learning_rate": 0.00019999201995345026,
219
+ "loss": 4.4747,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.006251764610978905,
224
+ "grad_norm": 0.064453125,
225
+ "learning_rate": 0.00019999119779965947,
226
+ "loss": 4.4948,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.006312265558827088,
231
+ "eval_loss": 4.493932723999023,
232
+ "eval_runtime": 8.8208,
233
+ "eval_samples_per_second": 22.674,
234
+ "eval_steps_per_second": 1.474,
235
+ "step": 313
236
+ },
237
+ {
238
+ "epoch": 0.00028233775662485377,
239
+ "grad_norm": 0.046875,
240
+ "learning_rate": 0.00019996118655688004,
241
+ "loss": 4.4607,
242
+ "step": 320
243
+ },
244
+ {
245
+ "epoch": 0.0006856774089460735,
246
+ "grad_norm": 0.04541015625,
247
+ "learning_rate": 0.00019995756127956854,
248
+ "loss": 4.4336,
249
+ "step": 330
250
+ },
251
+ {
252
+ "epoch": 0.0010890170612672932,
253
+ "grad_norm": 0.046142578125,
254
+ "learning_rate": 0.00019995377420631467,
255
+ "loss": 4.4016,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 0.001492356713588513,
260
+ "grad_norm": 0.04541015625,
261
+ "learning_rate": 0.00019994982534324835,
262
+ "loss": 4.2976,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 0.0018956963659097325,
267
+ "grad_norm": 0.05224609375,
268
+ "learning_rate": 0.00019994571469676142,
269
+ "loss": 4.2416,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 0.0022990360182309523,
274
+ "grad_norm": 0.052734375,
275
+ "learning_rate": 0.00019994144227350756,
276
+ "loss": 4.1895,
277
+ "step": 370
278
+ },
279
+ {
280
+ "epoch": 0.002702375670552172,
281
+ "grad_norm": 0.058349609375,
282
+ "learning_rate": 0.00019993700808040233,
283
+ "loss": 4.1082,
284
+ "step": 380
285
+ },
286
+ {
287
+ "epoch": 0.003105715322873392,
288
+ "grad_norm": 0.057861328125,
289
+ "learning_rate": 0.0001999324121246231,
290
+ "loss": 4.148,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 0.0035090549751946114,
295
+ "grad_norm": 0.06494140625,
296
+ "learning_rate": 0.00019992765441360905,
297
+ "loss": 4.1184,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 0.003912394627515831,
302
+ "grad_norm": 0.068359375,
303
+ "learning_rate": 0.00019992273495506133,
304
+ "loss": 4.1018,
305
+ "step": 410
306
+ },
307
+ {
308
+ "epoch": 0.004315734279837051,
309
+ "grad_norm": 0.06005859375,
310
+ "learning_rate": 0.00019991765375694276,
311
+ "loss": 4.0995,
312
+ "step": 420
313
+ },
314
+ {
315
+ "epoch": 0.0047190739321582706,
316
+ "grad_norm": 0.0634765625,
317
+ "learning_rate": 0.00019991241082747795,
318
+ "loss": 4.1194,
319
+ "step": 430
320
+ },
321
+ {
322
+ "epoch": 0.00512241358447949,
323
+ "grad_norm": 0.0634765625,
324
+ "learning_rate": 0.00019990700617515344,
325
+ "loss": 4.0612,
326
+ "step": 440
327
+ },
328
+ {
329
+ "epoch": 0.00552575323680071,
330
+ "grad_norm": 0.06298828125,
331
+ "learning_rate": 0.00019990143980871738,
332
+ "loss": 4.1001,
333
+ "step": 450
334
+ },
335
+ {
336
+ "epoch": 0.005929092889121929,
337
+ "grad_norm": 0.0634765625,
338
+ "learning_rate": 0.00019989571173717975,
339
+ "loss": 4.075,
340
+ "step": 460
341
+ },
342
+ {
343
+ "epoch": 0.00633243254144315,
344
+ "grad_norm": 0.07275390625,
345
+ "learning_rate": 0.00019988982196981233,
346
+ "loss": 4.1117,
347
+ "step": 470
348
+ },
349
+ {
350
+ "epoch": 0.006735772193764369,
351
+ "grad_norm": 0.0517578125,
352
+ "learning_rate": 0.00019988377051614854,
353
+ "loss": 4.5104,
354
+ "step": 480
355
+ },
356
+ {
357
+ "epoch": 0.007139111846085589,
358
+ "grad_norm": 0.047607421875,
359
+ "learning_rate": 0.00019987755738598356,
360
+ "loss": 4.5086,
361
+ "step": 490
362
+ },
363
+ {
364
+ "epoch": 0.007542451498406808,
365
+ "grad_norm": 0.047607421875,
366
+ "learning_rate": 0.00019987118258937416,
367
+ "loss": 4.4517,
368
+ "step": 500
369
+ },
370
+ {
371
+ "epoch": 0.007945791150728028,
372
+ "grad_norm": 0.044677734375,
373
+ "learning_rate": 0.000199864646136639,
374
+ "loss": 4.419,
375
+ "step": 510
376
+ },
377
+ {
378
+ "epoch": 0.008349130803049248,
379
+ "grad_norm": 0.046142578125,
380
+ "learning_rate": 0.00019985794803835825,
381
+ "loss": 4.4749,
382
+ "step": 520
383
+ },
384
+ {
385
+ "epoch": 0.008752470455370467,
386
+ "grad_norm": 0.0478515625,
387
+ "learning_rate": 0.00019985108830537372,
388
+ "loss": 4.4646,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 0.009155810107691688,
393
+ "grad_norm": 0.04345703125,
394
+ "learning_rate": 0.00019984406694878895,
395
+ "loss": 4.4207,
396
+ "step": 540
397
+ },
398
+ {
399
+ "epoch": 0.009559149760012906,
400
+ "grad_norm": 0.044677734375,
401
+ "learning_rate": 0.00019983688397996898,
402
+ "loss": 4.4308,
403
+ "step": 550
404
+ },
405
+ {
406
+ "epoch": 0.009962489412334127,
407
+ "grad_norm": 0.04345703125,
408
+ "learning_rate": 0.00019982953941054054,
409
+ "loss": 4.4311,
410
+ "step": 560
411
+ },
412
+ {
413
+ "epoch": 0.010365829064655347,
414
+ "grad_norm": 0.04541015625,
415
+ "learning_rate": 0.00019982203325239186,
416
+ "loss": 4.4623,
417
+ "step": 570
418
+ },
419
+ {
420
+ "epoch": 0.010769168716976566,
421
+ "grad_norm": 0.04833984375,
422
+ "learning_rate": 0.00019981436551767275,
423
+ "loss": 4.4461,
424
+ "step": 580
425
+ },
426
+ {
427
+ "epoch": 0.011172508369297786,
428
+ "grad_norm": 0.046630859375,
429
+ "learning_rate": 0.00019980653621879462,
430
+ "loss": 4.4197,
431
+ "step": 590
432
+ },
433
+ {
434
+ "epoch": 0.011575848021619005,
435
+ "grad_norm": 0.047119140625,
436
+ "learning_rate": 0.00019979854536843027,
437
+ "loss": 4.4135,
438
+ "step": 600
439
+ },
440
+ {
441
+ "epoch": 0.011979187673940225,
442
+ "grad_norm": 0.0498046875,
443
+ "learning_rate": 0.0001997903929795141,
444
+ "loss": 4.4105,
445
+ "step": 610
446
+ },
447
+ {
448
+ "epoch": 0.012382527326261444,
449
+ "grad_norm": 0.04638671875,
450
+ "learning_rate": 0.00019978207906524192,
451
+ "loss": 4.4107,
452
+ "step": 620
453
+ },
454
+ {
455
+ "epoch": 0.012624531117654176,
456
+ "eval_loss": 4.402504920959473,
457
+ "eval_runtime": 3.1085,
458
+ "eval_samples_per_second": 64.339,
459
+ "eval_steps_per_second": 8.042,
460
+ "step": 626
461
+ },
462
+ {
463
+ "epoch": 0.00016133586092848787,
464
+ "grad_norm": 0.047119140625,
465
+ "learning_rate": 0.000199773603639071,
466
+ "loss": 4.2779,
467
+ "step": 630
468
+ },
469
+ {
470
+ "epoch": 0.0005646755132497075,
471
+ "grad_norm": 0.0517578125,
472
+ "learning_rate": 0.0001997649667147201,
473
+ "loss": 4.2592,
474
+ "step": 640
475
+ },
476
+ {
477
+ "epoch": 0.0009680151655709273,
478
+ "grad_norm": 0.053466796875,
479
+ "learning_rate": 0.00019975616830616937,
480
+ "loss": 4.2228,
481
+ "step": 650
482
+ },
483
+ {
484
+ "epoch": 0.001371354817892147,
485
+ "grad_norm": 0.055908203125,
486
+ "learning_rate": 0.00019974720842766023,
487
+ "loss": 4.122,
488
+ "step": 660
489
+ },
490
+ {
491
+ "epoch": 0.0017746944702133668,
492
+ "grad_norm": 0.057373046875,
493
+ "learning_rate": 0.00019973808709369565,
494
+ "loss": 4.0099,
495
+ "step": 670
496
+ },
497
+ {
498
+ "epoch": 0.0021780341225345863,
499
+ "grad_norm": 0.0703125,
500
+ "learning_rate": 0.00019972880431903977,
501
+ "loss": 3.9065,
502
+ "step": 680
503
+ },
504
+ {
505
+ "epoch": 0.002581373774855806,
506
+ "grad_norm": 0.06982421875,
507
+ "learning_rate": 0.00019971936011871816,
508
+ "loss": 3.7953,
509
+ "step": 690
510
+ },
511
+ {
512
+ "epoch": 0.002984713427177026,
513
+ "grad_norm": 0.08349609375,
514
+ "learning_rate": 0.00019970975450801762,
515
+ "loss": 3.7788,
516
+ "step": 700
517
+ },
518
+ {
519
+ "epoch": 0.0033880530794982455,
520
+ "grad_norm": 0.08544921875,
521
+ "learning_rate": 0.00019969998750248626,
522
+ "loss": 3.7368,
523
+ "step": 710
524
+ },
525
+ {
526
+ "epoch": 0.003791392731819465,
527
+ "grad_norm": 0.09375,
528
+ "learning_rate": 0.0001996900591179334,
529
+ "loss": 3.7229,
530
+ "step": 720
531
+ },
532
+ {
533
+ "epoch": 0.004194732384140685,
534
+ "grad_norm": 0.095703125,
535
+ "learning_rate": 0.0001996799693704296,
536
+ "loss": 3.7192,
537
+ "step": 730
538
+ },
539
+ {
540
+ "epoch": 0.004598072036461905,
541
+ "grad_norm": 0.09228515625,
542
+ "learning_rate": 0.00019966971827630654,
543
+ "loss": 3.7457,
544
+ "step": 740
545
+ },
546
+ {
547
+ "epoch": 0.005001411688783124,
548
+ "grad_norm": 0.0986328125,
549
+ "learning_rate": 0.00019965930585215714,
550
+ "loss": 3.7031,
551
+ "step": 750
552
+ },
553
+ {
554
+ "epoch": 0.005404751341104344,
555
+ "grad_norm": 0.10595703125,
556
+ "learning_rate": 0.00019964873211483547,
557
+ "loss": 3.7335,
558
+ "step": 760
559
+ },
560
+ {
561
+ "epoch": 0.005808090993425563,
562
+ "grad_norm": 0.10986328125,
563
+ "learning_rate": 0.00019963799708145664,
564
+ "loss": 3.6902,
565
+ "step": 770
566
+ },
567
+ {
568
+ "epoch": 0.006211430645746784,
569
+ "grad_norm": 0.10400390625,
570
+ "learning_rate": 0.00019962710076939686,
571
+ "loss": 3.7408,
572
+ "step": 780
573
+ },
574
+ {
575
+ "epoch": 0.006614770298068003,
576
+ "grad_norm": 0.062255859375,
577
+ "learning_rate": 0.00019961604319629342,
578
+ "loss": 4.204,
579
+ "step": 790
580
+ },
581
+ {
582
+ "epoch": 0.007018109950389223,
583
+ "grad_norm": 0.053955078125,
584
+ "learning_rate": 0.00019960482438004462,
585
+ "loss": 4.364,
586
+ "step": 800
587
+ },
588
+ {
589
+ "epoch": 0.007421449602710442,
590
+ "grad_norm": 0.053955078125,
591
+ "learning_rate": 0.00019959344433880978,
592
+ "loss": 4.3305,
593
+ "step": 810
594
+ },
595
+ {
596
+ "epoch": 0.007824789255031663,
597
+ "grad_norm": 0.04833984375,
598
+ "learning_rate": 0.0001995819030910091,
599
+ "loss": 4.2992,
600
+ "step": 820
601
+ },
602
+ {
603
+ "epoch": 0.008228128907352882,
604
+ "grad_norm": 0.049072265625,
605
+ "learning_rate": 0.00019957020065532386,
606
+ "loss": 4.3347,
607
+ "step": 830
608
+ },
609
+ {
610
+ "epoch": 0.008631468559674102,
611
+ "grad_norm": 0.04931640625,
612
+ "learning_rate": 0.0001995583370506961,
613
+ "loss": 4.321,
614
+ "step": 840
615
+ },
616
+ {
617
+ "epoch": 0.00903480821199532,
618
+ "grad_norm": 0.0517578125,
619
+ "learning_rate": 0.00019954631229632884,
620
+ "loss": 4.3122,
621
+ "step": 850
622
+ },
623
+ {
624
+ "epoch": 0.009438147864316541,
625
+ "grad_norm": 0.05322265625,
626
+ "learning_rate": 0.00019953412641168588,
627
+ "loss": 4.2918,
628
+ "step": 860
629
+ },
630
+ {
631
+ "epoch": 0.00984148751663776,
632
+ "grad_norm": 0.052490234375,
633
+ "learning_rate": 0.00019952177941649185,
634
+ "loss": 4.2793,
635
+ "step": 870
636
+ },
637
+ {
638
+ "epoch": 0.01024482716895898,
639
+ "grad_norm": 0.048828125,
640
+ "learning_rate": 0.00019950927133073222,
641
+ "loss": 4.3363,
642
+ "step": 880
643
+ },
644
+ {
645
+ "epoch": 0.0106481668212802,
646
+ "grad_norm": 0.05126953125,
647
+ "learning_rate": 0.00019949660217465307,
648
+ "loss": 4.311,
649
+ "step": 890
650
+ },
651
+ {
652
+ "epoch": 0.01105150647360142,
653
+ "grad_norm": 0.05126953125,
654
+ "learning_rate": 0.00019948377196876138,
655
+ "loss": 4.2843,
656
+ "step": 900
657
+ },
658
+ {
659
+ "epoch": 0.01145484612592264,
660
+ "grad_norm": 0.053466796875,
661
+ "learning_rate": 0.00019947078073382466,
662
+ "loss": 4.2645,
663
+ "step": 910
664
+ },
665
+ {
666
+ "epoch": 0.011858185778243858,
667
+ "grad_norm": 0.05029296875,
668
+ "learning_rate": 0.00019945762849087113,
669
+ "loss": 4.2695,
670
+ "step": 920
671
+ },
672
+ {
673
+ "epoch": 0.012261525430565079,
674
+ "grad_norm": 0.054931640625,
675
+ "learning_rate": 0.00019944431526118964,
676
+ "loss": 4.279,
677
+ "step": 930
678
+ },
679
+ {
680
+ "epoch": 0.012624531117654176,
681
+ "eval_loss": 4.440088272094727,
682
+ "eval_runtime": 3.1398,
683
+ "eval_samples_per_second": 63.698,
684
+ "eval_steps_per_second": 7.962,
685
+ "step": 939
686
+ },
687
+ {
688
+ "epoch": 4.033396523212197e-05,
689
+ "grad_norm": 0.058837890625,
690
+ "learning_rate": 0.0,
691
+ "loss": 4.0975,
692
+ "step": 940
693
+ },
694
+ {
695
+ "epoch": 0.0004436736175533417,
696
+ "grad_norm": 0.07421875,
697
+ "learning_rate": 2e-05,
698
+ "loss": 4.1075,
699
+ "step": 950
700
+ },
701
+ {
702
+ "epoch": 0.0008470132698745614,
703
+ "grad_norm": 0.0703125,
704
+ "learning_rate": 4e-05,
705
+ "loss": 4.015,
706
+ "step": 960
707
+ },
708
+ {
709
+ "epoch": 0.001250352922195781,
710
+ "grad_norm": 0.07373046875,
711
+ "learning_rate": 6e-05,
712
+ "loss": 3.8564,
713
+ "step": 970
714
+ },
715
+ {
716
+ "epoch": 0.0016536925745170008,
717
+ "grad_norm": 0.1279296875,
718
+ "learning_rate": 8e-05,
719
+ "loss": 3.5457,
720
+ "step": 980
721
+ },
722
+ {
723
+ "epoch": 0.0020570322268382204,
724
+ "grad_norm": 0.1982421875,
725
+ "learning_rate": 0.0001,
726
+ "loss": 3.2341,
727
+ "step": 990
728
+ },
729
+ {
730
+ "epoch": 0.00246037187915944,
731
+ "grad_norm": 0.234375,
732
+ "learning_rate": 0.00012,
733
+ "loss": 3.0956,
734
+ "step": 1000
735
+ },
736
+ {
737
+ "epoch": 0.00286371153148066,
738
+ "grad_norm": 0.27734375,
739
+ "learning_rate": 0.00014,
740
+ "loss": 2.9331,
741
+ "step": 1010
742
+ },
743
+ {
744
+ "epoch": 0.0032670511838018795,
745
+ "grad_norm": 0.337890625,
746
+ "learning_rate": 0.00016,
747
+ "loss": 2.8938,
748
+ "step": 1020
749
+ },
750
+ {
751
+ "epoch": 0.003670390836123099,
752
+ "grad_norm": 0.328125,
753
+ "learning_rate": 0.00018,
754
+ "loss": 2.8145,
755
+ "step": 1030
756
+ },
757
+ {
758
+ "epoch": 0.004073730488444319,
759
+ "grad_norm": 0.296875,
760
+ "learning_rate": 0.0002,
761
+ "loss": 2.9022,
762
+ "step": 1040
763
+ },
764
+ {
765
+ "epoch": 0.004477070140765539,
766
+ "grad_norm": 0.279296875,
767
+ "learning_rate": 0.0001999999190676822,
768
+ "loss": 2.9572,
769
+ "step": 1050
770
+ },
771
+ {
772
+ "epoch": 0.004880409793086758,
773
+ "grad_norm": 0.33203125,
774
+ "learning_rate": 0.00019999967627085973,
775
+ "loss": 2.9381,
776
+ "step": 1060
777
+ },
778
+ {
779
+ "epoch": 0.005283749445407978,
780
+ "grad_norm": 0.2470703125,
781
+ "learning_rate": 0.00019999927160992563,
782
+ "loss": 2.9392,
783
+ "step": 1070
784
+ },
785
+ {
786
+ "epoch": 0.005687089097729197,
787
+ "grad_norm": 0.267578125,
788
+ "learning_rate": 0.00019999870508553488,
789
+ "loss": 2.8675,
790
+ "step": 1080
791
+ },
792
+ {
793
+ "epoch": 0.006090428750050418,
794
+ "grad_norm": 0.279296875,
795
+ "learning_rate": 0.00019999797669860455,
796
+ "loss": 2.9042,
797
+ "step": 1090
798
+ },
799
+ {
800
+ "epoch": 0.006493768402371637,
801
+ "grad_norm": 0.162109375,
802
+ "learning_rate": 0.00019999708645031353,
803
+ "loss": 3.4063,
804
+ "step": 1100
805
+ },
806
+ {
807
+ "epoch": 0.006897108054692857,
808
+ "grad_norm": 0.08349609375,
809
+ "learning_rate": 0.00019999603434210292,
810
+ "loss": 4.137,
811
+ "step": 1110
812
+ },
813
+ {
814
+ "epoch": 0.0073004477070140765,
815
+ "grad_norm": 0.0732421875,
816
+ "learning_rate": 0.00019999482037567565,
817
+ "loss": 4.1305,
818
+ "step": 1120
819
+ },
820
+ {
821
+ "epoch": 0.007703787359335296,
822
+ "grad_norm": 0.06591796875,
823
+ "learning_rate": 0.00019999344455299674,
824
+ "loss": 4.0303,
825
+ "step": 1130
826
+ },
827
+ {
828
+ "epoch": 0.008107127011656516,
829
+ "grad_norm": 0.0634765625,
830
+ "learning_rate": 0.0001999919068762931,
831
+ "loss": 4.0717,
832
+ "step": 1140
833
+ },
834
+ {
835
+ "epoch": 0.008510466663977735,
836
+ "grad_norm": 0.059326171875,
837
+ "learning_rate": 0.00019999020734805373,
838
+ "loss": 4.0664,
839
+ "step": 1150
840
+ },
841
+ {
842
+ "epoch": 0.008913806316298956,
843
+ "grad_norm": 0.060302734375,
844
+ "learning_rate": 0.0001999883459710296,
845
+ "loss": 4.0298,
846
+ "step": 1160
847
+ },
848
+ {
849
+ "epoch": 0.009317145968620174,
850
+ "grad_norm": 0.057861328125,
851
+ "learning_rate": 0.00019998632274823358,
852
+ "loss": 4.0348,
853
+ "step": 1170
854
+ },
855
+ {
856
+ "epoch": 0.009720485620941395,
857
+ "grad_norm": 0.06201171875,
858
+ "learning_rate": 0.00019998413768294052,
859
+ "loss": 4.0192,
860
+ "step": 1180
861
+ },
862
+ {
863
+ "epoch": 0.010123825273262615,
864
+ "grad_norm": 0.064453125,
865
+ "learning_rate": 0.0001999817907786873,
866
+ "loss": 4.033,
867
+ "step": 1190
868
+ },
869
+ {
870
+ "epoch": 0.010527164925583834,
871
+ "grad_norm": 0.0654296875,
872
+ "learning_rate": 0.00019997928203927275,
873
+ "loss": 4.0413,
874
+ "step": 1200
875
+ },
876
+ {
877
+ "epoch": 0.010930504577905054,
878
+ "grad_norm": 0.06884765625,
879
+ "learning_rate": 0.00019997661146875758,
880
+ "loss": 4.0011,
881
+ "step": 1210
882
+ },
883
+ {
884
+ "epoch": 0.011333844230226273,
885
+ "grad_norm": 0.068359375,
886
+ "learning_rate": 0.00019997377907146459,
887
+ "loss": 3.9817,
888
+ "step": 1220
889
+ },
890
+ {
891
+ "epoch": 0.011737183882547493,
892
+ "grad_norm": 0.06884765625,
893
+ "learning_rate": 0.0001999707848519783,
894
+ "loss": 4.0007,
895
+ "step": 1230
896
+ },
897
+ {
898
+ "epoch": 0.012140523534868712,
899
+ "grad_norm": 0.0654296875,
900
+ "learning_rate": 0.0001999676288151454,
901
+ "loss": 4.0265,
902
+ "step": 1240
903
+ },
904
+ {
905
+ "epoch": 0.012543863187189933,
906
+ "grad_norm": 0.060302734375,
907
+ "learning_rate": 0.00019996431096607438,
908
+ "loss": 4.172,
909
+ "step": 1250
910
+ },
911
+ {
912
+ "epoch": 0.012624531117654176,
913
+ "eval_loss": 4.49122428894043,
914
+ "eval_runtime": 3.1912,
915
+ "eval_samples_per_second": 62.672,
916
+ "eval_steps_per_second": 7.834,
917
+ "step": 1252
918
+ },
919
+ {"epoch": 0.00032267172185697574, "grad_norm": 0.08203125, "learning_rate": 1.4000000000000001e-06, "loss": 4.1169, "step": 1260},
+ {"epoch": 0.0007260113741781954, "grad_norm": 0.1220703125, "learning_rate": 3.4000000000000005e-06, "loss": 4.0418, "step": 1270},
+ {"epoch": 0.001129351026499415, "grad_norm": 0.1943359375, "learning_rate": 5.400000000000001e-06, "loss": 3.9211, "step": 1280},
+ {"epoch": 0.0015326906788206349, "grad_norm": 0.328125, "learning_rate": 7.4e-06, "loss": 3.6888, "step": 1290},
+ {"epoch": 0.0019360303311418546, "grad_norm": 0.375, "learning_rate": 9.4e-06, "loss": 3.4768, "step": 1300},
+ {"epoch": 0.0023393699834630744, "grad_norm": 0.408203125, "learning_rate": 1.14e-05, "loss": 3.242, "step": 1310},
+ {"epoch": 0.002742709635784294, "grad_norm": 0.3671875, "learning_rate": 1.3400000000000002e-05, "loss": 2.9401, "step": 1320},
+ {"epoch": 0.0031460492881055136, "grad_norm": 0.265625, "learning_rate": 1.54e-05, "loss": 2.7321, "step": 1330},
+ {"epoch": 0.0035493889404267336, "grad_norm": 0.23828125, "learning_rate": 1.7400000000000003e-05, "loss": 2.413, "step": 1340},
+ {"epoch": 0.003952728592747953, "grad_norm": 0.2353515625, "learning_rate": 1.94e-05, "loss": 2.2395, "step": 1350},
+ {"epoch": 0.004356068245069173, "grad_norm": 0.2353515625, "learning_rate": 1.9999999012581816e-05, "loss": 2.2062, "step": 1360},
+ {"epoch": 0.004759407897390392, "grad_norm": 0.232421875, "learning_rate": 1.999999417624832e-05, "loss": 2.1174, "step": 1370},
+ {"epoch": 0.005162747549711612, "grad_norm": 0.2265625, "learning_rate": 1.999998530963894e-05, "loss": 1.9498, "step": 1380},
+ {"epoch": 0.005566087202032832, "grad_norm": 0.23046875, "learning_rate": 1.999997241275724e-05, "loss": 1.8648, "step": 1390},
+ {"epoch": 0.005969426854354052, "grad_norm": 0.2392578125, "learning_rate": 1.9999955485608426e-05, "loss": 1.8192, "step": 1400},
+ {"epoch": 0.006372766506675271, "grad_norm": 0.296875, "learning_rate": 1.999993452819932e-05, "loss": 2.0548, "step": 1410},
+ {"epoch": 0.006776106158996491, "grad_norm": 0.34765625, "learning_rate": 1.999990954053836e-05, "loss": 3.5804, "step": 1420},
+ {"epoch": 0.0071794458113177105, "grad_norm": 0.21484375, "learning_rate": 1.9999880522635625e-05, "loss": 3.8265, "step": 1430},
+ {"epoch": 0.00758278546363893, "grad_norm": 0.130859375, "learning_rate": 1.999984747450281e-05, "loss": 3.7124, "step": 1440},
+ {"epoch": 0.00798612511596015, "grad_norm": 0.1142578125, "learning_rate": 1.9999810396153232e-05, "loss": 3.7061, "step": 1450},
+ {"epoch": 0.00838946476828137, "grad_norm": 0.103515625, "learning_rate": 1.9999769287601834e-05, "loss": 3.68, "step": 1460},
+ {"epoch": 0.008792804420602589, "grad_norm": 0.09814453125, "learning_rate": 1.9999724148865183e-05, "loss": 3.6099, "step": 1470},
+ {"epoch": 0.00919614407292381, "grad_norm": 0.09423828125, "learning_rate": 1.9999674979961473e-05, "loss": 3.613, "step": 1480},
+ {"epoch": 0.00959948372524503, "grad_norm": 0.0966796875, "learning_rate": 1.999962178091052e-05, "loss": 3.5696, "step": 1490},
+ {"epoch": 0.010002823377566248, "grad_norm": 0.10986328125, "learning_rate": 1.9999564551733764e-05, "loss": 3.5333, "step": 1500},
+ {"epoch": 0.010406163029887469, "grad_norm": 0.09375, "learning_rate": 1.9999503292454275e-05, "loss": 3.5119, "step": 1510},
+ {"epoch": 0.010809502682208687, "grad_norm": 0.103515625, "learning_rate": 1.9999438003096733e-05, "loss": 3.403, "step": 1520},
+ {"epoch": 0.011212842334529908, "grad_norm": 0.10546875, "learning_rate": 1.9999368683687457e-05, "loss": 3.3882, "step": 1530},
+ {"epoch": 0.011616181986851127, "grad_norm": 0.0986328125, "learning_rate": 1.999929533425439e-05, "loss": 3.3576, "step": 1540},
+ {"epoch": 0.012019521639172347, "grad_norm": 0.107421875, "learning_rate": 1.999921795482708e-05, "loss": 3.4566, "step": 1550},
+ {"epoch": 0.012422861291493567, "grad_norm": 0.158203125, "learning_rate": 1.9999136545436727e-05, "loss": 3.7767, "step": 1560},
+ {"epoch": 0.012826200943814786, "grad_norm": 0.2177734375, "learning_rate": 1.999905110611613e-05, "loss": 4.5098, "step": 1570},
+ {"epoch": 0.013229540596136007, "grad_norm": 0.1552734375, "learning_rate": 1.9998961636899736e-05, "loss": 4.6336, "step": 1580},
+ {"epoch": 0.013632880248457225, "grad_norm": 0.12255859375, "learning_rate": 1.999886813782359e-05, "loss": 4.5697, "step": 1590},
+ {"epoch": 0.014036219900778446, "grad_norm": 0.10498046875, "learning_rate": 1.999877060892538e-05, "loss": 4.5598, "step": 1600},
+ {"epoch": 0.014439559553099664, "grad_norm": 0.08984375, "learning_rate": 1.9998669050244416e-05, "loss": 4.5326, "step": 1610},
+ {"epoch": 0.014842899205420885, "grad_norm": 0.07958984375, "learning_rate": 1.999856346182163e-05, "loss": 4.5219, "step": 1620},
+ {"epoch": 0.015246238857742105, "grad_norm": 0.07177734375, "learning_rate": 1.999845384369957e-05, "loss": 4.5013, "step": 1630},
+ {"epoch": 0.015649578510063326, "grad_norm": 0.07080078125, "learning_rate": 1.9998340195922418e-05, "loss": 4.4859, "step": 1640},
+ {"epoch": 0.016052918162384543, "grad_norm": 0.061767578125, "learning_rate": 1.999822251853598e-05, "loss": 4.5328, "step": 1650},
+ {"epoch": 0.016456257814705763, "grad_norm": 0.05859375, "learning_rate": 1.9998100811587686e-05, "loss": 4.5024, "step": 1660},
+ {"epoch": 0.016859597467026984, "grad_norm": 0.06201171875, "learning_rate": 1.9997975075126573e-05, "loss": 4.5178, "step": 1670},
+ {"epoch": 0.017262937119348204, "grad_norm": 0.05908203125, "learning_rate": 1.9997845309203333e-05, "loss": 4.4892, "step": 1680},
+ {"epoch": 0.017666276771669424, "grad_norm": 0.055908203125, "learning_rate": 1.9997711513870257e-05, "loss": 4.4645, "step": 1690},
+ {"epoch": 0.01806961642399064, "grad_norm": 0.0546875, "learning_rate": 1.9997573689181272e-05, "loss": 4.4891, "step": 1700},
+ {"epoch": 0.018472956076311862, "grad_norm": 0.054443359375, "learning_rate": 1.999743183519192e-05, "loss": 4.4604, "step": 1710},
+ {"epoch": 0.018876295728633082, "grad_norm": 0.05224609375, "learning_rate": 1.9997285951959372e-05, "loss": 4.4935, "step": 1720},
+ {"epoch": 0.019279635380954303, "grad_norm": 0.052978515625, "learning_rate": 1.999713603954243e-05, "loss": 4.4723, "step": 1730},
+ {"epoch": 0.01968297503327552, "grad_norm": 0.052734375, "learning_rate": 1.9996982098001508e-05, "loss": 4.4923, "step": 1740},
+ {"epoch": 0.02008631468559674, "grad_norm": 0.050537109375, "learning_rate": 1.9996824127398648e-05, "loss": 4.4402, "step": 1750},
+ {"epoch": 0.02048965433791796, "grad_norm": 0.05078125, "learning_rate": 1.999666212779752e-05, "loss": 4.4716, "step": 1760},
+ {"epoch": 0.02089299399023918, "grad_norm": 0.0478515625, "learning_rate": 1.999649609926341e-05, "loss": 4.4952, "step": 1770},
+ {"epoch": 0.0212963336425604, "grad_norm": 0.052490234375, "learning_rate": 1.9996326041863236e-05, "loss": 4.4755, "step": 1780},
+ {"epoch": 0.02169967329488162, "grad_norm": 0.04931640625, "learning_rate": 1.9996151955665535e-05, "loss": 4.4645, "step": 1790},
+ {"epoch": 0.02210301294720284, "grad_norm": 0.05029296875, "learning_rate": 1.9995973840740467e-05, "loss": 4.474, "step": 1800},
+ {"epoch": 0.02250635259952406, "grad_norm": 0.05029296875, "learning_rate": 1.999579169715982e-05, "loss": 4.4614, "step": 1810},
+ {"epoch": 0.02290969225184528, "grad_norm": 0.049560546875, "learning_rate": 1.9995605524996996e-05, "loss": 4.4622, "step": 1820},
+ {"epoch": 0.0233130319041665, "grad_norm": 0.052490234375, "learning_rate": 1.9995415324327038e-05, "loss": 4.4785, "step": 1830},
+ {"epoch": 0.023716371556487717, "grad_norm": 0.048828125, "learning_rate": 1.9995221095226597e-05, "loss": 4.5104, "step": 1840},
+ {"epoch": 0.024119711208808937, "grad_norm": 0.048095703125, "learning_rate": 1.999502283777395e-05, "loss": 4.4759, "step": 1850},
+ {"epoch": 0.024523050861130158, "grad_norm": 0.0458984375, "learning_rate": 1.9994820552049002e-05, "loss": 4.4359, "step": 1860},
+ {"epoch": 0.02492639051345138, "grad_norm": 0.045166015625, "learning_rate": 1.9994614238133282e-05, "loss": 4.4895, "step": 1870},
+ {"epoch": 0.0253297301657726, "grad_norm": 0.04541015625, "learning_rate": 1.9994403896109942e-05, "loss": 4.4755, "step": 1880},
+ {"epoch": 0.025733069818093816, "grad_norm": 0.045654296875, "learning_rate": 1.9994189526063746e-05, "loss": 4.4661, "step": 1890},
+ {"epoch": 0.026136409470415036, "grad_norm": 0.046630859375, "learning_rate": 1.99939711280811e-05, "loss": 4.4775, "step": 1900},
+ {"epoch": 0.026539749122736257, "grad_norm": 0.04931640625, "learning_rate": 1.999374870225003e-05, "loss": 4.4954, "step": 1910},
+ {"epoch": 0.026943088775057477, "grad_norm": 0.046142578125, "learning_rate": 1.9993522248660163e-05, "loss": 4.4905, "step": 1920},
+ {"epoch": 0.027346428427378694, "grad_norm": 0.049072265625, "learning_rate": 1.9993291767402776e-05, "loss": 4.4545, "step": 1930},
+ {"epoch": 0.027749768079699914, "grad_norm": 0.045166015625, "learning_rate": 1.9993057258570762e-05, "loss": 4.4737, "step": 1940},
+ {"epoch": 0.028153107732021135, "grad_norm": 0.046630859375, "learning_rate": 1.9992818722258626e-05, "loss": 4.4908, "step": 1950},
+ {"epoch": 0.028556447384342355, "grad_norm": 0.047607421875, "learning_rate": 1.9992576158562515e-05, "loss": 4.4893, "step": 1960},
+ {"epoch": 0.028959787036663576, "grad_norm": 0.04736328125, "learning_rate": 1.999232956758018e-05, "loss": 4.4452, "step": 1970},
+ {"epoch": 0.029363126688984793, "grad_norm": 0.044677734375, "learning_rate": 1.999207894941101e-05, "loss": 4.4901, "step": 1980},
+ {"epoch": 0.029766466341306013, "grad_norm": 0.0478515625, "learning_rate": 1.9991824304156006e-05, "loss": 4.419, "step": 1990},
+ {"epoch": 0.030169805993627234, "grad_norm": 0.048828125, "learning_rate": 1.99915656319178e-05, "loss": 4.4521, "step": 2000},
+ {"epoch": 0.030573145645948454, "grad_norm": 0.046875, "learning_rate": 1.999130293280065e-05, "loss": 4.4561, "step": 2010},
+ {"epoch": 0.030976485298269674, "grad_norm": 0.0595703125, "learning_rate": 1.9991036206910417e-05, "loss": 4.4544, "step": 2020},
+ {"epoch": 0.03137982495059089, "grad_norm": 0.048095703125, "learning_rate": 1.999076545435461e-05, "loss": 4.4713, "step": 2030},
+ {"epoch": 0.03178316460291211, "grad_norm": 0.047119140625, "learning_rate": 1.999049067524235e-05, "loss": 4.4408, "step": 2040},
+ {"epoch": 0.03218650425523333, "grad_norm": 0.04541015625, "learning_rate": 1.9990211869684374e-05, "loss": 4.4686, "step": 2050},
+ {"epoch": 0.03258984390755455, "grad_norm": 0.047119140625, "learning_rate": 1.998992903779305e-05, "loss": 4.4673, "step": 2060},
+ {"epoch": 0.03299318355987577, "grad_norm": 0.046630859375, "learning_rate": 1.9989642179682374e-05, "loss": 4.4302, "step": 2070},
+ {"epoch": 0.033396523212196993, "grad_norm": 0.046875, "learning_rate": 1.998935129546795e-05, "loss": 4.4844, "step": 2080},
+ {"epoch": 0.033799862864518214, "grad_norm": 0.0439453125, "learning_rate": 1.9989056385267015e-05, "loss": 4.4739, "step": 2090},
+ {"epoch": 0.03420320251683943, "grad_norm": 0.047119140625, "learning_rate": 1.9988757449198428e-05, "loss": 4.499, "step": 2100},
+ {"epoch": 0.03460654216916065, "grad_norm": 0.046875, "learning_rate": 1.9988454487382667e-05, "loss": 4.4517, "step": 2110},
+ {"epoch": 0.03500988182148187, "grad_norm": 0.044677734375, "learning_rate": 1.9988147499941832e-05, "loss": 4.4608, "step": 2120},
+ {"epoch": 0.03541322147380309, "grad_norm": 0.0458984375, "learning_rate": 1.998783648699965e-05, "loss": 4.4515, "step": 2130},
+ {"epoch": 0.03581656112612431, "grad_norm": 0.04443359375, "learning_rate": 1.9987521448681465e-05, "loss": 4.4646, "step": 2140},
+ {"epoch": 0.03621990077844553, "grad_norm": 0.05908203125, "learning_rate": 1.9987202385114252e-05, "loss": 4.4586, "step": 2150},
+ {"epoch": 0.03662324043076675, "grad_norm": 0.04736328125, "learning_rate": 1.99868792964266e-05, "loss": 4.4699, "step": 2160},
+ {"epoch": 0.03702658008308797, "grad_norm": 0.044189453125, "learning_rate": 1.9986552182748715e-05, "loss": 4.4678, "step": 2170},
+ {"epoch": 0.03742991973540919, "grad_norm": 0.04541015625, "learning_rate": 1.9986221044212442e-05, "loss": 4.4581, "step": 2180},
+ {"epoch": 0.037833259387730404, "grad_norm": 0.045654296875, "learning_rate": 1.998588588095124e-05, "loss": 4.4273, "step": 2190},
+ {"epoch": 0.038236599040051625, "grad_norm": 0.044677734375, "learning_rate": 1.9985546693100186e-05, "loss": 4.4602, "step": 2200},
+ {"epoch": 0.038639938692372845, "grad_norm": 0.04931640625, "learning_rate": 1.9985203480795977e-05, "loss": 4.4766, "step": 2210},
+ {"epoch": 0.039043278344694066, "grad_norm": 0.045166015625, "learning_rate": 1.9984856244176948e-05, "loss": 4.4579, "step": 2220},
+ {"epoch": 0.039446617997015286, "grad_norm": 0.04638671875, "learning_rate": 1.998450498338303e-05, "loss": 4.4661, "step": 2230},
+ {"epoch": 0.039849957649336507, "grad_norm": 0.043701171875, "learning_rate": 1.9984149698555808e-05, "loss": 4.4415, "step": 2240},
+ {"epoch": 0.04025329730165773, "grad_norm": 0.046875, "learning_rate": 1.998379038983846e-05, "loss": 4.4461, "step": 2250},
+ {"epoch": 0.04065663695397895, "grad_norm": 0.04443359375, "learning_rate": 1.9983427057375802e-05, "loss": 4.4627, "step": 2260},
+ {"epoch": 0.04105997660630017, "grad_norm": 0.0458984375, "learning_rate": 1.9983059701314267e-05, "loss": 4.4399, "step": 2270},
+ {"epoch": 0.04146331625862139, "grad_norm": 0.045654296875, "learning_rate": 1.9982688321801906e-05, "loss": 4.4673, "step": 2280},
+ {"epoch": 0.0418666559109426, "grad_norm": 0.046142578125, "learning_rate": 1.99823129189884e-05, "loss": 4.478, "step": 2290},
+ {"epoch": 0.04226999556326382, "grad_norm": 0.045166015625, "learning_rate": 1.9981933493025044e-05, "loss": 4.4433, "step": 2300},
+ {"epoch": 0.04267333521558504, "grad_norm": 0.045166015625, "learning_rate": 1.9981550044064756e-05, "loss": 4.4535, "step": 2310},
+ {"epoch": 0.04307667486790626, "grad_norm": 0.046142578125, "learning_rate": 1.998116257226208e-05, "loss": 4.4665, "step": 2320},
+ {"epoch": 0.043480014520227483, "grad_norm": 0.045166015625, "learning_rate": 1.9980771077773177e-05, "loss": 4.4419, "step": 2330},
+ {"epoch": 0.043883354172548704, "grad_norm": 0.046875, "learning_rate": 1.9980375560755833e-05, "loss": 4.4724, "step": 2340},
+ {"epoch": 0.044286693824869924, "grad_norm": 0.044921875, "learning_rate": 1.997997602136944e-05, "loss": 4.4485, "step": 2350},
+ {"epoch": 0.044690033477191145, "grad_norm": 0.044921875, "learning_rate": 1.997957245977504e-05, "loss": 4.4484, "step": 2360},
+ {"epoch": 0.045093373129512365, "grad_norm": 0.044921875, "learning_rate": 1.997916487613527e-05, "loss": 4.4594, "step": 2370},
+ {"epoch": 0.04549671278183358, "grad_norm": 0.04638671875, "learning_rate": 1.9978753270614403e-05, "loss": 4.4836, "step": 2380},
+ {"epoch": 0.0459000524341548, "grad_norm": 0.044921875, "learning_rate": 1.997833764337832e-05, "loss": 4.4767, "step": 2390},
+ {"epoch": 0.04630339208647602, "grad_norm": 0.04736328125, "learning_rate": 1.9977917994594537e-05, "loss": 4.4358, "step": 2400},
+ {"epoch": 0.04670673173879724, "grad_norm": 0.04541015625, "learning_rate": 1.997749432443218e-05, "loss": 4.4552, "step": 2410},
+ {"epoch": 0.04711007139111846, "grad_norm": 0.044677734375, "learning_rate": 1.9977066633062002e-05, "loss": 4.4383, "step": 2420},
+ {"epoch": 0.04751341104343968, "grad_norm": 0.045654296875, "learning_rate": 1.9976634920656374e-05, "loss": 4.4605, "step": 2430},
+ {"epoch": 0.0479167506957609, "grad_norm": 0.042724609375, "learning_rate": 1.9976199187389286e-05, "loss": 4.4496, "step": 2440},
+ {"epoch": 0.04832009034808212, "grad_norm": 0.04638671875, "learning_rate": 1.997575943343635e-05, "loss": 4.454, "step": 2450},
+ {"epoch": 0.04872343000040334, "grad_norm": 0.044189453125, "learning_rate": 1.997531565897481e-05, "loss": 4.4212, "step": 2460},
+ {"epoch": 0.04912676965272456, "grad_norm": 0.043212890625, "learning_rate": 1.9974867864183508e-05, "loss": 4.4441, "step": 2470},
+ {"epoch": 0.049530109305045776, "grad_norm": 0.04541015625, "learning_rate": 1.997441604924292e-05, "loss": 4.4085, "step": 2480},
+ {"epoch": 0.049933448957366997, "grad_norm": 0.044189453125, "learning_rate": 1.997396021433514e-05, "loss": 4.4616, "step": 2490},
+ {"epoch": 0.05033678860968822, "grad_norm": 0.04833984375, "learning_rate": 1.9973500359643885e-05, "loss": 4.4544, "step": 2500},
+ {"epoch": 0.05074012826200944, "grad_norm": 0.044677734375, "learning_rate": 1.9973036485354485e-05, "loss": 4.453, "step": 2510},
+ {"epoch": 0.05114346791433066, "grad_norm": 0.04638671875, "learning_rate": 1.99725685916539e-05, "loss": 4.4722, "step": 2520},
+ {"epoch": 0.05154680756665188, "grad_norm": 0.04296875, "learning_rate": 1.99720966787307e-05, "loss": 4.4332, "step": 2530},
+ {"epoch": 0.0519501472189731, "grad_norm": 0.04638671875, "learning_rate": 1.9971620746775077e-05, "loss": 4.4757, "step": 2540},
+ {"epoch": 0.05235348687129432, "grad_norm": 0.04638671875, "learning_rate": 1.997114079597885e-05, "loss": 4.4264, "step": 2550},
+ {"epoch": 0.05275682652361554, "grad_norm": 0.04638671875, "learning_rate": 1.997065682653545e-05, "loss": 4.4372, "step": 2560},
+ {"epoch": 0.05316016617593675, "grad_norm": 0.045166015625, "learning_rate": 1.997016883863993e-05, "loss": 4.4566, "step": 2570},
+ {"epoch": 0.05356350582825797, "grad_norm": 0.04345703125, "learning_rate": 1.9969676832488965e-05, "loss": 4.4309, "step": 2580},
+ {"epoch": 0.053966845480579194, "grad_norm": 0.046142578125, "learning_rate": 1.9969180808280845e-05, "loss": 4.4621, "step": 2590},
+ {"epoch": 0.054370185132900414, "grad_norm": 0.04443359375, "learning_rate": 1.9968680766215477e-05, "loss": 4.4465, "step": 2600},
+ {"epoch": 0.054773524785221635, "grad_norm": 0.04541015625, "learning_rate": 1.9968176706494403e-05, "loss": 4.4239, "step": 2610},
+ {"epoch": 0.055176864437542855, "grad_norm": 0.042724609375, "learning_rate": 1.996766862932076e-05, "loss": 4.4567, "step": 2620},
+ {"epoch": 0.055580204089864076, "grad_norm": 0.046142578125, "learning_rate": 1.996715653489933e-05, "loss": 4.423, "step": 2630},
+ {"epoch": 0.055983543742185296, "grad_norm": 0.04248046875, "learning_rate": 1.9966640423436492e-05, "loss": 4.3849, "step": 2640},
+ {"epoch": 0.056386883394506516, "grad_norm": 0.0458984375, "learning_rate": 1.9966120295140258e-05, "loss": 4.4788, "step": 2650},
+ {"epoch": 0.05679022304682774, "grad_norm": 0.045166015625, "learning_rate": 1.996559615022025e-05, "loss": 4.4313, "step": 2660},
+ {"epoch": 0.05719356269914895, "grad_norm": 0.044921875, "learning_rate": 1.996506798888772e-05, "loss": 4.4493, "step": 2670},
+ {"epoch": 0.05759690235147017, "grad_norm": 0.047607421875, "learning_rate": 1.9964535811355524e-05, "loss": 4.4438, "step": 2680},
+ {"epoch": 0.05800024200379139, "grad_norm": 0.04296875, "learning_rate": 1.996399961783815e-05, "loss": 4.413, "step": 2690},
+ {"epoch": 0.05840358165611261, "grad_norm": 0.043701171875, "learning_rate": 1.9963459408551693e-05, "loss": 4.4538, "step": 2700},
+ {"epoch": 0.05880692130843383, "grad_norm": 0.0439453125, "learning_rate": 1.996291518371388e-05, "loss": 4.466, "step": 2710},
+ {"epoch": 0.05921026096075505, "grad_norm": 0.045654296875, "learning_rate": 1.9962366943544045e-05, "loss": 4.4963, "step": 2720},
+ {"epoch": 0.05961360061307627, "grad_norm": 0.04541015625, "learning_rate": 1.9961814688263138e-05, "loss": 4.46, "step": 2730},
+ {"epoch": 0.06001694026539749, "grad_norm": 0.04345703125, "learning_rate": 1.9961258418093745e-05, "loss": 4.4481, "step": 2740},
+ {"epoch": 0.060420279917718714, "grad_norm": 0.044677734375, "learning_rate": 1.9960698133260053e-05, "loss": 4.442, "step": 2750},
+ {"epoch": 0.06082361957003993, "grad_norm": 0.052490234375, "learning_rate": 1.9960133833987866e-05, "loss": 4.4473, "step": 2760},
+ {"epoch": 0.06122695922236115, "grad_norm": 0.07958984375, "learning_rate": 1.9959565520504625e-05, "loss": 4.4286, "step": 2770},
+ {"epoch": 0.06163029887468237, "grad_norm": 0.042724609375, "learning_rate": 1.9958993193039365e-05, "loss": 4.4616, "step": 2780},
+ {"epoch": 0.06203363852700359, "grad_norm": 0.04296875, "learning_rate": 1.9958416851822755e-05, "loss": 4.4409, "step": 2790},
+ {"epoch": 0.06243697817932481, "grad_norm": 0.042236328125, "learning_rate": 1.9957836497087074e-05, "loss": 4.4416, "step": 2800},
+ {"epoch": 0.06284031783164602, "grad_norm": 0.04541015625, "learning_rate": 1.9957252129066227e-05, "loss": 4.4552, "step": 2810},
+ {"epoch": 0.06324365748396725, "grad_norm": 0.046875, "learning_rate": 1.9956663747995724e-05, "loss": 4.4536, "step": 2820},
+ {"epoch": 0.06364699713628846, "grad_norm": 0.045654296875, "learning_rate": 1.99560713541127e-05, "loss": 4.4544, "step": 2830},
+ {"epoch": 0.06405033678860969, "grad_norm": 0.0419921875, "learning_rate": 1.9955474947655912e-05, "loss": 4.4288, "step": 2840},
+ {"epoch": 0.0644536764409309, "grad_norm": 0.041748046875, "learning_rate": 1.995487452886572e-05, "loss": 4.4397, "step": 2850},
+ {"epoch": 0.06485701609325213, "grad_norm": 0.044677734375, "learning_rate": 1.995427009798411e-05, "loss": 4.4909, "step": 2860},
+ {"epoch": 0.06526035574557335, "grad_norm": 0.043701171875, "learning_rate": 1.9953661655254695e-05, "loss": 4.4528, "step": 2870},
+ {"epoch": 0.06566369539789457, "grad_norm": 0.045654296875, "learning_rate": 1.9953049200922684e-05, "loss": 4.4308, "step": 2880},
+ {"epoch": 0.06606703505021579, "grad_norm": 0.04541015625, "learning_rate": 1.9952432735234918e-05, "loss": 4.4585, "step": 2890},
+ {"epoch": 0.066470374702537, "grad_norm": 0.046630859375, "learning_rate": 1.9951812258439846e-05, "loss": 4.4663, "step": 2900},
+ {"epoch": 0.06687371435485823, "grad_norm": 0.0439453125, "learning_rate": 1.995118777078754e-05, "loss": 4.4549, "step": 2910},
+ {"epoch": 0.06727705400717944, "grad_norm": 0.04931640625, "learning_rate": 1.9950559272529686e-05, "loss": 4.4434, "step": 2920},
+ {"epoch": 0.06768039365950067, "grad_norm": 0.04443359375, "learning_rate": 1.9949926763919586e-05, "loss": 4.4086, "step": 2930},
+ {"epoch": 0.06808373331182188, "grad_norm": 0.045166015625, "learning_rate": 1.9949290245212157e-05, "loss": 4.4456, "step": 2940},
+ {"epoch": 0.06848707296414311, "grad_norm": 0.042724609375, "learning_rate": 1.9948649716663936e-05, "loss": 4.4395, "step": 2950},
+ {"epoch": 0.06889041261646432, "grad_norm": 0.048095703125, "learning_rate": 1.994800517853307e-05, "loss": 4.4357, "step": 2960},
+ {"epoch": 0.06929375226878555, "grad_norm": 0.041015625, "learning_rate": 1.9947356631079337e-05, "loss": 4.4129, "step": 2970},
+ {"epoch": 0.06969709192110676, "grad_norm": 0.042724609375, "learning_rate": 1.9946704074564105e-05, "loss": 4.4587, "step": 2980},
+ {"epoch": 0.07010043157342799, "grad_norm": 0.04248046875, "learning_rate": 1.994604750925038e-05, "loss": 4.4844, "step": 2990},
+ {"epoch": 0.0705037712257492, "grad_norm": 0.044921875, "learning_rate": 1.9945386935402775e-05, "loss": 4.4578, "step": 3000},
+ {"epoch": 0.07090711087807042, "grad_norm": 0.04443359375, "learning_rate": 1.9944722353287518e-05, "loss": 4.4306, "step": 3010},
+ {"epoch": 0.07131045053039164, "grad_norm": 0.04248046875, "learning_rate": 1.994405376317246e-05, "loss": 4.4542, "step": 3020},
+ {"epoch": 0.07171379018271286, "grad_norm": 0.044921875, "learning_rate": 1.9943381165327053e-05, "loss": 4.4166, "step": 3030},
+ {"epoch": 0.07211712983503409, "grad_norm": 0.045166015625, "learning_rate": 1.9942704560022378e-05, "loss": 4.4745, "step": 3040},
+ {"epoch": 0.0725204694873553, "grad_norm": 0.045166015625, "learning_rate": 1.9942023947531122e-05, "loss": 4.4555, "step": 3050},
+ {"epoch": 0.07292380913967653, "grad_norm": 0.047119140625, "learning_rate": 1.99413393281276e-05, "loss": 4.4133, "step": 3060},
+ {"epoch": 0.07332714879199774, "grad_norm": 0.04638671875, "learning_rate": 1.9940650702087718e-05, "loss": 4.4555, "step": 3070},
+ {"epoch": 0.07373048844431897, "grad_norm": 0.044677734375, "learning_rate": 1.9939958069689026e-05, "loss": 4.4011, "step": 3080},
+ {"epoch": 0.07413382809664018, "grad_norm": 0.04541015625, "learning_rate": 1.9939261431210664e-05, "loss": 4.4595, "step": 3090},
+ {"epoch": 0.0745371677489614, "grad_norm": 0.04296875, "learning_rate": 1.9938560786933398e-05, "loss": 4.452, "step": 3100},
+ {"epoch": 0.07494050740128262, "grad_norm": 0.04345703125, "learning_rate": 1.9937856137139612e-05, "loss": 4.4497, "step": 3110},
+ {"epoch": 0.07534384705360384, "grad_norm": 0.044677734375, "learning_rate": 1.9937147482113296e-05, "loss": 4.4514, "step": 3120},
+ {"epoch": 0.07574718670592506, "grad_norm": 0.04541015625, "learning_rate": 1.993643482214006e-05, "loss": 4.4674, "step": 3130},
+ {"epoch": 0.07615052635824628, "grad_norm": 0.0419921875, "learning_rate": 1.9935718157507124e-05, "loss": 4.4503, "step": 3140},
+ {"epoch": 0.0765538660105675, "grad_norm": 0.044677734375, "learning_rate": 1.9934997488503325e-05, "loss": 4.4512, "step": 3150},
+ {"epoch": 0.07695720566288872, "grad_norm": 0.044677734375, "learning_rate": 1.993427281541911e-05, "loss": 4.4441, "step": 3160},
+ {"epoch": 0.07736054531520994, "grad_norm": 0.044189453125, "learning_rate": 1.9933544138546542e-05, "loss": 4.4542, "step": 3170},
+ {"epoch": 0.07776388496753116, "grad_norm": 0.0439453125, "learning_rate": 1.9932811458179305e-05, "loss": 4.4436, "step": 3180},
+ {"epoch": 0.07816722461985237, "grad_norm": 0.047607421875, "learning_rate": 1.993207477461268e-05, "loss": 4.4158, "step": 3190},
+ {"epoch": 0.0785705642721736, "grad_norm": 0.04443359375, "learning_rate": 1.993133408814358e-05, "loss": 4.4518, "step": 3200},
+ {"epoch": 0.07897390392449481, "grad_norm": 0.044921875, "learning_rate": 1.9930589399070515e-05, "loss": 4.4289, "step": 3210},
+ {"epoch": 0.07937724357681604, "grad_norm": 0.045654296875, "learning_rate": 1.9929840707693618e-05, "loss": 4.4452, "step": 3220},
+ {"epoch": 0.07978058322913725, "grad_norm": 0.042724609375, "learning_rate": 1.9929088014314636e-05, "loss": 4.462, "step": 3230},
+ {"epoch": 0.08018392288145848, "grad_norm": 0.04345703125, "learning_rate": 1.992833131923692e-05, "loss": 4.4282, "step": 3240},
+ {"epoch": 0.0805872625337797, "grad_norm": 0.0439453125, "learning_rate": 1.9927570622765443e-05, "loss": 4.4584, "step": 3250},
+ {"epoch": 0.08099060218610092, "grad_norm": 0.045166015625, "learning_rate": 1.9926805925206784e-05, "loss": 4.455, "step": 3260},
+ {"epoch": 0.08139394183842213, "grad_norm": 0.043212890625, "learning_rate": 1.992603722686914e-05, "loss": 4.4233, "step": 3270},
+ {"epoch": 0.08179728149074335, "grad_norm": 0.043701171875, "learning_rate": 1.9925264528062317e-05, "loss": 4.4435, "step": 3280},
+ {"epoch": 0.08220062114306458, "grad_norm": 0.04541015625, "learning_rate": 1.9924487829097733e-05, "loss": 4.4385, "step": 3290},
+ {"epoch": 0.08260396079538579, "grad_norm": 0.04248046875, "learning_rate": 1.9923707130288415e-05, "loss": 4.4204, "step": 3300},
+ {"epoch": 0.08300730044770702, "grad_norm": 0.043701171875, "learning_rate": 1.9922922431949017e-05, "loss": 4.4202, "step": 3310},
+ {"epoch": 0.08341064010002823, "grad_norm": 0.04296875, "learning_rate": 1.9922133734395787e-05, "loss": 4.394, "step": 3320},
+ {"epoch": 0.08381397975234946, "grad_norm": 0.04443359375, "learning_rate": 1.9921341037946592e-05, "loss": 4.4216, "step": 3330},
+ {"epoch": 0.08421731940467067, "grad_norm": 0.044677734375, "learning_rate": 1.9920544342920913e-05, "loss": 4.4231, "step": 3340},
+ {"epoch": 0.0846206590569919, "grad_norm": 0.0458984375, "learning_rate": 1.991974364963984e-05, "loss": 4.4847, "step": 3350},
+ {"epoch": 0.08502399870931311, "grad_norm": 0.04248046875, "learning_rate": 1.9918938958426075e-05, "loss": 4.4063, "step": 3360},
+ {"epoch": 0.08542733836163434, "grad_norm": 0.0458984375, "learning_rate": 1.9918130269603926e-05, "loss": 4.4668, "step": 3370},
+ {"epoch": 0.08583067801395555, "grad_norm": 0.0458984375, "learning_rate": 1.991731758349933e-05, "loss": 4.4454, "step": 3380},
+ {"epoch": 0.08623401766627677, "grad_norm": 0.04541015625, "learning_rate": 1.9916500900439806e-05, "loss": 4.4527, "step": 3390},
+ {"epoch": 0.086637357318598, "grad_norm": 0.042724609375, "learning_rate": 1.991568022075451e-05, "loss": 4.4057, "step": 3400},
+ {"epoch": 0.0870406969709192, "grad_norm": 0.044189453125, "learning_rate": 1.9914855544774195e-05, "loss": 4.4503, "step": 3410},
+ {"epoch": 0.08744403662324043, "grad_norm": 0.043212890625, "learning_rate": 1.991402687283123e-05, "loss": 4.4968, "step": 3420},
+ {"epoch": 0.08784737627556165, "grad_norm": 0.045166015625, "learning_rate": 1.9913194205259595e-05, "loss": 4.4642, "step": 3430},
+ {"epoch": 0.08825071592788288, "grad_norm": 0.0439453125, "learning_rate": 1.9912357542394873e-05, "loss": 4.4283, "step": 3440},
+ {"epoch": 0.08865405558020409, "grad_norm": 0.0458984375, "learning_rate": 1.9911516884574262e-05, "loss": 4.4776, "step": 3450},
+ {"epoch": 0.08905739523252532, "grad_norm": 0.045166015625, "learning_rate": 1.9910672232136578e-05, "loss": 4.4578, "step": 3460},
+ {"epoch": 0.08946073488484653, "grad_norm": 0.04443359375, "learning_rate": 1.990982358542223e-05, "loss": 4.4718, "step": 3470},
+ {"epoch": 0.08986407453716774, "grad_norm": 0.043212890625, "learning_rate": 1.9908970944773255e-05, "loss": 4.4575, "step": 3480},
+ {"epoch": 0.09026741418948897, "grad_norm": 0.04296875, "learning_rate": 1.9908114310533285e-05, "loss": 4.4147, "step": 3490},
+ {"epoch": 0.09067075384181018, "grad_norm": 0.048095703125, "learning_rate": 1.990725368304757e-05, "loss": 4.4363, "step": 3500},
+ {"epoch": 0.09107409349413141, "grad_norm": 0.0439453125, "learning_rate": 1.990638906266297e-05, "loss": 4.4575, "step": 3510},
+ {"epoch": 0.09147743314645262, "grad_norm": 0.043212890625, "learning_rate": 1.990552044972794e-05, "loss": 4.4545, "step": 3520},
+ {"epoch": 0.09188077279877385, "grad_norm": 0.044189453125, "learning_rate": 1.9904647844592572e-05, "loss": 4.4439, "step": 3530},
+ {"epoch": 0.09228411245109507, "grad_norm": 0.043701171875, "learning_rate": 1.9903771247608535e-05, "loss": 4.4192, "step": 3540},
+ {"epoch": 0.09268745210341629, "grad_norm": 0.044677734375, "learning_rate": 1.9902890659129125e-05, "loss": 4.4251, "step": 3550},
+ {"epoch": 0.0930907917557375, "grad_norm": 0.041259765625, "learning_rate": 1.990200607950925e-05, "loss": 4.4339, "step": 3560},
+ {"epoch": 0.09349413140805872, "grad_norm": 0.04541015625, "learning_rate": 1.9901117509105417e-05, "loss": 4.4635, "step": 3570},
+ {"epoch": 0.09389747106037995, "grad_norm": 0.044921875, "learning_rate": 1.990022494827574e-05, "loss": 4.4255, "step": 3580},
+ {"epoch": 0.09430081071270116, "grad_norm": 0.044677734375, "learning_rate": 1.9899328397379955e-05, "loss": 4.4412, "step": 3590},
+ {"epoch": 0.09470415036502239, "grad_norm": 0.042724609375, "learning_rate": 1.989842785677939e-05, "loss": 4.437, "step": 3600},
+ {"epoch": 0.0951074900173436, "grad_norm": 0.04541015625, "learning_rate": 1.9897523326836987e-05, "loss": 4.4136, "step": 3610},
+ {"epoch": 0.09551082966966483, "grad_norm": 0.047607421875, "learning_rate": 1.98966148079173e-05, "loss": 4.4353, "step": 3620},
+ {"epoch": 0.09591416932198604, "grad_norm": 0.044677734375, "learning_rate": 1.989570230038649e-05, "loss": 4.4331, "step": 3630},
+ {"epoch": 0.09631750897430727, "grad_norm": 0.043212890625, "learning_rate": 1.989478580461232e-05, "loss": 4.4311, "step": 3640},
+ {"epoch": 0.09672084862662848, "grad_norm": 0.04443359375, "learning_rate": 1.9893865320964162e-05, "loss": 4.3961, "step": 3650},
+ {"epoch": 0.0971241882789497, "grad_norm": 0.044921875, "learning_rate": 1.9892940849812997e-05, "loss": 4.4645, "step": 3660},
+ {"epoch": 0.09752752793127092, "grad_norm": 0.046142578125, "learning_rate": 1.9892012391531413e-05, "loss": 4.495, "step": 3670},
+ {"epoch": 0.09793086758359214, "grad_norm": 0.044677734375, "learning_rate": 1.989107994649361e-05, "loss": 4.4074, "step": 3680},
+ {"epoch": 0.09833420723591337, "grad_norm": 0.04638671875, "learning_rate": 1.989014351507538e-05, "loss": 4.441, "step": 3690},
+ {"epoch": 0.09873754688823458, "grad_norm": 0.045654296875, "learning_rate": 1.988920309765413e-05, "loss": 4.446, "step": 3700},
+ {"epoch": 0.0991408865405558, "grad_norm": 0.045166015625, "learning_rate": 1.9888258694608886e-05, "loss": 4.4378, "step": 3710},
+ {"epoch": 0.09954422619287702, "grad_norm": 0.044189453125, "learning_rate": 1.988731030632026e-05, "loss": 4.4068, "step": 3720},
+ {"epoch": 0.09994756584519825, "grad_norm": 0.0458984375, "learning_rate": 1.988635793317048e-05, "loss": 4.4587, "step": 3730},
+ {"epoch": 0.10035090549751946, "grad_norm": 0.042236328125, "learning_rate": 1.9885401575543384e-05, "loss": 4.4586, "step": 3740},
+ {"epoch": 0.10075424514984069, "grad_norm": 0.044677734375, "learning_rate": 1.98844412338244e-05, "loss": 4.4342, "step": 3750},
+ {"epoch": 0.1011575848021619, "grad_norm": 0.04638671875, "learning_rate": 1.9883476908400587e-05, "loss": 4.4647, "step": 3760},
+ {"epoch": 0.10156092445448311, "grad_norm": 0.04345703125, "learning_rate": 1.9882508599660583e-05, "loss": 4.4362, "step": 3770},
+ {"epoch": 0.10196426410680434, "grad_norm": 0.04248046875, "learning_rate": 1.9881536307994645e-05, "loss": 4.4402, "step": 3780},
+ {"epoch": 0.10236760375912556, "grad_norm": 0.0439453125, "learning_rate": 1.9880560033794637e-05, "loss": 4.4238, "step": 3790},
+ {"epoch": 0.10277094341144678, "grad_norm": 0.04443359375, "learning_rate": 1.9879579777454027e-05, "loss": 4.4556, "step": 3800},
+ {"epoch": 0.103174283063768, "grad_norm": 0.049560546875, "learning_rate": 1.987859553936788e-05, "loss": 4.4287, "step": 3810},
+ {"epoch": 0.10357762271608922, "grad_norm": 0.045654296875, "learning_rate": 1.9877607319932872e-05, "loss": 4.421, "step": 3820},
+ {"epoch": 0.10398096236841044, "grad_norm": 0.04638671875, "learning_rate": 1.9876615119547286e-05, "loss": 4.4607, "step": 3830},
+ {"epoch": 0.10438430202073166, "grad_norm": 0.04638671875, "learning_rate": 1.9875618938611008e-05, "loss": 4.4433, "step": 3840},
+ {"epoch": 0.10478764167305288, "grad_norm": 0.04443359375, "learning_rate": 1.987461877752552e-05, "loss": 4.4713, "step": 3850},
+ {"epoch": 0.10519098132537409, "grad_norm": 0.043701171875, "learning_rate": 1.9873614636693918e-05, "loss": 4.4631, "step": 3860},
+ {"epoch": 0.10559432097769532, "grad_norm": 0.045166015625, "learning_rate": 1.9872606516520898e-05, "loss": 4.3911, "step": 3870},
+ {"epoch": 0.10599766063001653, "grad_norm": 0.0458984375, "learning_rate": 1.9871594417412763e-05, "loss": 4.451, "step": 3880},
+ {"epoch": 0.10640100028233776, "grad_norm": 0.04541015625, "learning_rate": 1.9870578339777416e-05, "loss": 4.4256, "step": 3890},
+ {"epoch": 0.10680433993465897, "grad_norm": 0.04541015625, "learning_rate": 1.9869558284024363e-05, "loss": 4.4336, "step": 3900},
+ {"epoch": 0.1072076795869802, "grad_norm": 0.044921875, "learning_rate": 1.9868534250564713e-05, "loss": 4.464, "step": 3910},
+ {"epoch": 0.10761101923930141, "grad_norm": 0.042236328125, "learning_rate": 1.9867506239811188e-05, "loss": 4.4258, "step": 3920},
+ {"epoch": 0.10801435889162264, "grad_norm": 0.04736328125, "learning_rate": 1.9866474252178096e-05, "loss": 4.4037, "step": 3930},
+ {"epoch": 0.10841769854394386, "grad_norm": 0.04345703125, "learning_rate": 1.9865438288081366e-05, "loss": 4.383, "step": 3940},
+ {"epoch": 0.10882103819626507, "grad_norm": 0.0439453125, "learning_rate": 1.986439834793851e-05, "loss": 4.4667, "step": 3950},
+ {"epoch": 0.1092243778485863, "grad_norm": 0.044677734375, "learning_rate": 1.986335443216866e-05, "loss": 4.4176, "step": 3960},
+ {"epoch": 0.10962771750090751, "grad_norm": 0.044189453125, "learning_rate": 1.9862306541192536e-05, "loss": 4.4293, "step": 3970},
+ {"epoch": 0.11003105715322874, "grad_norm": 0.047119140625, "learning_rate": 1.9861254675432478e-05, "loss": 4.4302, "step": 3980},
+ {"epoch": 0.11043439680554995, "grad_norm": 0.04541015625, "learning_rate": 1.9860198835312408e-05, "loss": 4.4271, "step": 3990},
+ {"epoch": 0.11083773645787118, "grad_norm": 0.044677734375, "learning_rate": 1.985913902125786e-05, "loss": 4.4409, "step": 4000},
+ {"epoch": 0.11124107611019239, "grad_norm": 0.04541015625, "learning_rate": 1.9858075233695974e-05, "loss": 4.4272, "step": 4010},
+ {"epoch": 0.11164441576251362, "grad_norm": 0.0439453125, "learning_rate": 1.9857007473055482e-05, "loss": 4.426, "step": 4020},
+ {"epoch": 0.11204775541483483, "grad_norm": 0.0439453125, "learning_rate": 1.9855935739766724e-05, "loss": 4.4782, "step": 4030},
+ {"epoch": 0.11245109506715605, "grad_norm": 0.047119140625, "learning_rate": 1.9854860034261635e-05, "loss": 4.4529, "step": 4040},
+ {"epoch": 0.11285443471947727, "grad_norm": 0.044677734375, "learning_rate": 1.9853780356973757e-05, "loss": 4.413, "step": 4050},
+ {"epoch": 0.11325777437179849, "grad_norm": 0.045166015625, "learning_rate": 1.9852696708338224e-05, "loss": 4.4179, "step": 4060},
+ {"epoch": 0.11366111402411971, "grad_norm": 0.04443359375, "learning_rate": 1.9851609088791783e-05, "loss": 4.4679, "step": 4070},
+ {"epoch": 0.11406445367644093, "grad_norm": 0.04345703125, "learning_rate": 1.9850517498772775e-05, "loss": 4.4556, "step": 4080},
+ {"epoch": 0.11446779332876215, "grad_norm": 0.043701171875, "learning_rate": 1.9849421938721137e-05, "loss": 4.4282, "step": 4090},
+ {"epoch": 0.11487113298108337, "grad_norm": 0.0439453125, "learning_rate": 1.9848322409078412e-05, "loss": 4.4264, "step": 4100},
+ {"epoch": 0.1152744726334046, "grad_norm": 0.045166015625, "learning_rate": 1.9847218910287743e-05, "loss": 4.4565, "step": 4110},
+ {"epoch": 0.11567781228572581, "grad_norm": 0.04736328125, "learning_rate": 1.9846111442793866e-05, "loss": 4.4284, "step": 4120},
+ {"epoch": 0.11608115193804702, "grad_norm": 0.045166015625, "learning_rate": 1.984500000704313e-05, "loss": 4.4254, "step": 4130},
+ {"epoch": 0.11648449159036825, "grad_norm": 0.0458984375, "learning_rate": 1.9843884603483464e-05, "loss": 4.4437, "step": 4140},
+ {"epoch": 0.11688783124268946, "grad_norm": 0.0439453125, "learning_rate": 1.9842765232564415e-05, "loss": 4.4201, "step": 4150},
+ {"epoch": 0.11729117089501069, "grad_norm": 0.044921875, "learning_rate": 1.9841641894737113e-05, "loss": 4.442, "step": 4160},
+ {"epoch": 0.1176945105473319, "grad_norm": 0.044189453125, "learning_rate": 1.98405145904543e-05, "loss": 4.4023, "step": 4170},
+ {"epoch": 0.11809785019965313, "grad_norm": 0.044921875, "learning_rate": 1.9839383320170308e-05, "loss": 4.4158, "step": 4180},
+ {"epoch": 0.11850118985197435, "grad_norm": 0.0439453125, "learning_rate": 1.9838248084341077e-05, "loss": 4.4407, "step": 4190},
+ {"epoch": 0.11890452950429557, "grad_norm": 0.044189453125, "learning_rate": 1.9837108883424128e-05, "loss": 4.4053, "step": 4200},
+ {"epoch": 0.11930786915661679, "grad_norm": 0.044189453125, "learning_rate": 1.98359657178786e-05, "loss": 4.453, "step": 4210},
+ {"epoch": 0.11971120880893801, "grad_norm": 0.0439453125, "learning_rate": 1.9834818588165216e-05, "loss": 4.428, "step": 4220},
+ {"epoch": 0.12011454846125923, "grad_norm": 0.046875, "learning_rate": 1.98336674947463e-05, "loss": 4.4208, "step": 4230},
+ {"epoch": 0.12051788811358044, "grad_norm": 0.0439453125, "learning_rate": 1.9832512438085776e-05, "loss": 4.4668, "step": 4240},
+ {"epoch": 0.12092122776590167, "grad_norm": 0.04345703125, "learning_rate": 1.9831353418649168e-05, "loss": 4.4151, "step": 4250},
+ {"epoch": 0.12132456741822288, "grad_norm": 0.0478515625, "learning_rate": 1.9830190436903587e-05, "loss": 4.4326, "step": 4260},
+ {"epoch": 0.12172790707054411, "grad_norm": 0.044677734375, "learning_rate": 1.982902349331775e-05, "loss": 4.4254, "step": 4270},
+ {"epoch": 0.12213124672286532, "grad_norm": 0.043212890625, "learning_rate": 1.9827852588361966e-05, "loss": 4.46, "step": 4280},
+ {"epoch": 0.12253458637518655, "grad_norm": 0.04541015625, "learning_rate": 1.982667772250815e-05, "loss": 4.4317, "step": 4290},
+ {"epoch": 0.12293792602750776, "grad_norm": 0.044677734375, "learning_rate": 1.9825498896229793e-05, "loss": 4.4348, "step": 4300},
+ {"epoch": 0.12334126567982899, "grad_norm": 0.043701171875, "learning_rate": 1.9824316110002e-05, "loss": 4.4246, "step": 4310},
+ {"epoch": 0.1237446053321502, "grad_norm": 0.0439453125, "learning_rate": 1.9823129364301474e-05, "loss": 4.4576, "step": 4320},
+ {"epoch": 0.12414794498447142, "grad_norm": 0.04541015625, "learning_rate": 1.9821938659606496e-05, "loss": 4.4288, "step": 4330},
+ {"epoch": 0.12455128463679264, "grad_norm": 0.0439453125, "learning_rate": 1.9820743996396957e-05, "loss": 4.4153, "step": 4340},
+ {"epoch": 0.12495462428911386, "grad_norm": 0.046142578125, "learning_rate": 1.981954537515434e-05, "loss": 4.4412, "step": 4350},
+ {"epoch": 0.12535796394143509, "grad_norm": 0.0458984375, "learning_rate": 1.9818342796361723e-05, "loss": 4.4165, "step": 4360},
+ {"epoch": 0.1257613035937563, "grad_norm": 0.046630859375, "learning_rate": 1.981713626050378e-05, "loss": 4.4294, "step": 4370},
+ {"epoch": 0.1261646432460775, "grad_norm": 0.045166015625, "learning_rate": 1.9815925768066776e-05, "loss": 4.4297, "step": 4380}
+ ],
+ "logging_steps": 10,
+ "max_steps": 49586,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 313,
+ "stateful_callbacks": {"TrainerControl": {"args": {"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false}, "attributes": {}}},
+ "total_flos": 1.014857922095612e+19,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:851261daa18b1a1011c324f3d85b62e997899ab3bdb437a88e53633344259ebc
+ size 5841
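
For anyone inspecting this checkpoint: the trainer state above appears to stitch together two runs. Entries up to step 1252 decay from a peak learning rate near 2e-4, while from step 1260 onward the epoch counter resets and the learning rate warms up again toward a peak near 2e-5, which suggests training was restarted from that point with a roughly 10x lower peak learning rate. A minimal sketch for plotting the curve, assuming a local clone of the repo; the file name `trainer_state.json` and the `log_history` field are the standard ones written by the `transformers` `Trainer`, and `matplotlib` is an extra dependency not shipped with this repo:

```python
# Minimal sketch: plot the training loss recorded in this checkpoint's
# trainer_state.json (assumed to sit in the current working directory).
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss" instead,
# so filtering on "loss" keeps only the training curve.
train_logs = [e for e in state["log_history"] if "loss" in e]

plt.plot([e["step"] for e in train_logs], [e["loss"] for e in train_logs])
plt.xlabel("step")
plt.ylabel("training loss")
plt.show()
```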