cterdam committed on
Commit 89e4d5b · verified · 1 Parent(s): dabe020

Upload folder using huggingface_hub
command.txt ADDED
@@ -0,0 +1 @@
+ main.py deepseek
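
command.txt records the invocation that produced this run: main.py with the model key deepseek, which selects the matching section of run_config.json below. main.py itself is not part of this commit, so the following is only a hypothetical sketch of such a dispatch; the function and argument names are assumptions.

```python
# Hypothetical sketch of an entry point like main.py dispatching on the model
# key recorded in command.txt ("deepseek"). main.py is not in this commit, so
# the names here are assumptions; only the run_config.json layout is real.
import json
import sys

def load_model_section(config_path: str, model_key: str) -> dict:
    """Return the per-model section of run_config.json for the given key."""
    with open(config_path) as f:
        run_config = json.load(f)
    return run_config["model"][model_key]

if __name__ == "__main__":
    key = sys.argv[1] if len(sys.argv) > 1 else "deepseek"
    section = load_model_section("run_config.json", key)
    print(section["base_model_id"])  # deepseek-ai/deepseek-coder-1.3b-instruct
```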
config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "_name_or_path": "deepseek-ai/deepseek-coder-1.3b-instruct",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 32013,
+   "eos_token_id": 32021,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 5504,
+   "max_position_embeddings": 16384,
+   "model_type": "llama",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 16,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": {
+     "factor": 4.0,
+     "type": "linear"
+   },
+   "rope_theta": 100000,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.38.2",
+   "use_cache": true,
+   "vocab_size": 32256
+ }
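
The config describes a 24-layer Llama-architecture model (hidden size 2048) with linear RoPE scaling (factor 4.0) and a 16384-token position limit. A minimal sketch of loading it with transformers (the version pinned above is 4.38.2); loading from the current directory and the float16 cast are assumptions:

```python
# Minimal sketch, assuming the checkpoint sits in the current directory; pass
# the Hub repo id instead when loading remotely.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained(".")
assert config.rope_scaling == {"factor": 4.0, "type": "linear"}

# config.json stores float32 weights; casting to float16 at load time is an
# assumption made here to halve memory, not something the commit prescribes.
model = AutoModelForCausalLM.from_pretrained(".", torch_dtype=torch.float16)
print(model.config.max_position_embeddings)  # 16384
```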
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 32013,
+   "eos_token_id": 32021,
+   "transformers_version": "4.38.2"
+ }
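
These defaults were derived from the model config (`_from_model_config`) and pin only the special-token ids; any sampling settings must be supplied at generate() time. A small sketch, assuming the file sits in the working directory:

```python
# Sketch: read the generation defaults shipped with the checkpoint.
from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained(".")
assert (gen.bos_token_id, gen.eos_token_id) == (32013, 32021)
```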
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc8b408211f3d6f12fa928573bae13aa44e32745654c884785497fd845f908d7
+ size 4986380064
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5700ee949e4ad2afc984948f6240588c65a0e30033d9643f20b5db0321dee689
+ size 399532808
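
What the diff shows for both shards is the Git LFS pointer file, not the weights themselves: each records the spec version, a sha256 object id, and the blob size in bytes (about 4.99 GB and 0.40 GB here). A sketch that checks a downloaded shard against the hash in its pointer:

```python
# Sketch: verify a downloaded shard against the sha256 recorded in its LFS
# pointer. Streaming in 1 MiB chunks avoids holding ~5 GB in memory.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()

expected = "bc8b408211f3d6f12fa928573bae13aa44e32745654c884785497fd845f908d7"
assert sha256_of("model-00001-of-00002.safetensors") == expected
```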
model.safetensors.index.json ADDED
@@ -0,0 +1,226 @@
+ {
+   "metadata": {
+     "total_size": 5385887744
+   },
+   "weight_map": {
+     "lm_head.weight": "model-00002-of-00002.safetensors",
+     "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.norm.weight": "model-00002-of-00002.safetensors"
+   }
+ }
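
The weight map also shows where the shard boundary falls: everything through layer 22 plus layer 23's attention projections sit in shard 1, while layer 23's MLP and norms, the final norm, and lm_head sit in shard 2. A sketch that resolves a tensor name through this index and reads only that tensor with safetensors:

```python
# Sketch: look up which shard holds a tensor, then lazily read just that
# tensor instead of loading a whole ~5 GB shard.
import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.layers.23.mlp.down_proj.weight"
shard = index["weight_map"][name]  # -> model-00002-of-00002.safetensors
with safe_open(shard, framework="pt") as f:
    tensor = f.get_tensor(name)
print(shard, tuple(tensor.shape))
```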
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a2305a178c90a806a5a0a24ecec62cadd0a11e730fcd4f791ac633f449b8c675
+ size 2699039674
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc5ad86f4f472e3fbe661a3e37dcb4cc0d3f931baf861c388a004f41ab765191
+ size 14244
run_config.json ADDED
@@ -0,0 +1,604 @@
+ {
+   "model": {
+     "codellama": {
+       "base_model_id": "codellama/CodeLlama-7b-hf",
+       "quantitize": "int8",
+       "dataset": "Arithmetic_Simple",
+       "data_collator": "DataCollatorForSeq2Seq",
+       "lora_config": {
+         "r": 16,
+         "lora_alpha": 16,
+         "target_modules": [
+           "q_proj",
+           "k_proj",
+           "v_proj",
+           "o_proj",
+           "gate_proj",
+           "up_proj",
+           "down_proj"
+         ],
+         "lora_dropout": 0.05,
+         "bias": "none",
+         "task_type": "CAUSAL_LM"
+       },
+       "training_args": {
+         "output_dir": "codellama-output",
+         "warmup_steps": 100,
+         "per_device_train_batch_size": 1,
+         "per_device_eval_batch_size": 1,
+         "gradient_accumulation_steps": 4,
+         "max_steps": 10000,
+         "learning_rate": 0.0003,
+         "optim": "adamw_torch",
+         "logging_dir": "codellama-output-logs",
+         "logging_steps": 10,
+         "save_strategy": "steps",
+         "save_steps": 500,
+         "load_best_model_at_end": false,
+         "group_by_length": true,
+         "fp16": true,
+         "evaluation_strategy": "steps",
+         "eval_steps": 1000
+       },
+       "tokenizer": {
+         "tokenize_config": {
+           "truncation": true,
+           "max_length": 192,
+           "padding": "max_length"
+         },
+         "prompt_template": "config/qa_template.txt"
+       }
+     },
+     "phi-2": {
+       "base_model_id": "microsoft/phi-2",
+       "quantitize": "fp16",
+       "dataset": "Arithmetic_Simple",
+       "data_collator": "DataCollatorForLanguageModeling",
+       "lora_config": {
+         "r": 32,
+         "lora_alpha": 64,
+         "target_modules": [
+           "q_proj",
+           "k_proj",
+           "v_proj",
+           "dense",
+           "fc1",
+           "fc2"
+         ],
+         "bias": "none",
+         "lora_dropout": 0.05,
+         "task_type": "CAUSAL_LM"
+       },
+       "training_args": {
+         "output_dir": "phi2-output",
+         "warmup_steps": 500,
+         "per_device_train_batch_size": 1,
+         "per_device_eval_batch_size": 1,
+         "gradient_accumulation_steps": 4,
+         "max_steps": 100000,
+         "learning_rate": 0.0003,
+         "optim": "paged_adamw_8bit",
+         "logging_dir": "phi2-output-logs",
+         "logging_steps": 100,
+         "save_strategy": "steps",
+         "save_steps": 500,
+         "evaluation_strategy": "steps",
+         "eval_steps": 500,
+         "fp16": true
+       },
+       "tokenizer": {
+         "tokenize_config": {
+           "truncation": true,
+           "max_length": 512,
+           "padding": "max_length"
+         },
+         "prompt_template": "config/qa_template.txt"
+       }
+     },
+     "deepseek": {
+       "base_model_id": "deepseek-ai/deepseek-coder-1.3b-instruct",
+       "quantitize": "fp16",
+       "dataset": "ghcode_python",
+       "data_collator": "DataCollatorForLanguageModeling",
+       "lora_config": {
+         "r": 32,
+         "lora_alpha": 64,
+         "target_modules": [
+           "q_proj",
+           "k_proj",
+           "v_proj",
+           "o_proj",
+           "gate_proj",
+           "up_proj",
+           "down_proj"
+         ],
+         "bias": "none",
+         "lora_dropout": 0.05,
+         "task_type": "CAUSAL_LM"
+       },
+       "lora_large_config": {
+         "r": 128,
+         "lora_alpha": 256,
+         "target_modules": [
+           "q_proj",
+           "k_proj",
+           "v_proj",
+           "o_proj",
+           "gate_proj",
+           "up_proj",
+           "down_proj"
+         ],
+         "bias": "none",
+         "lora_dropout": 0.05,
+         "task_type": "CAUSAL_LM"
+       },
+       "p_tuning_config": {
+         "num_virtual_tokens": 16,
+         "num_transformer_submodules": 1,
+         "token_dim": 2048,
+         "encoder_hidden_size": 2048,
+         "task_type": "CAUSAL_LM"
+       },
+       "training_args": {
+         "output_dir": "runs/deepseek-ctrl-gh",
+         "warmup_steps": 0,
+         "per_device_train_batch_size": 8,
+         "per_device_eval_batch_size": 8,
+         "gradient_accumulation_steps": 4,
+         "max_steps": 15000,
+         "learning_rate": 2e-05,
+         "optim": "paged_adamw_8bit",
+         "logging_dir": "runs/deepseek-ctrl-gh/logs",
+         "logging_steps": 100,
+         "save_strategy": "steps",
+         "save_steps": 5000,
+         "evaluation_strategy": "steps",
+         "eval_steps": 5000,
+         "weight_decay": 0.01,
+         "fp16": true
+       },
+       "tokenizer": {
+         "tokenize_config": {
+           "truncation": true,
+           "max_length": 512,
+           "padding": "max_length"
+         },
+         "prompt_template": "config/qa_template.txt"
+       }
+     }
+   },
+   "dataset": {
+     "simple_dataset": {
+       "type": "huggingface",
+       "dataset_purpose": "downstream",
+       "name": "b-mc2/sql-create-context",
+       "train_split": 0.9,
+       "max_train_size": 100,
+       "filling_field": [
+         "question",
+         "context",
+         "answer"
+       ]
+     },
+     "testdset": {
+       "type": "local",
+       "dataset_purpose": "downstream",
+       "train_file": "data/Test/TestDataset.json",
+       "val_file": "data/Test/TestDataset.json",
+       "test_file": "data/Test/TestDataset.json",
+       "filling_field": [
+         "prompted_question",
+         "answer"
+       ]
+     },
+     "mixture_codegen": {
+       "filling_field": [
+         "Question",
+         "Answer"
+       ],
+       "dataset_purpose": "downstream"
+     },
+     "MathQA_Python_loader": {
+       "type": "list-like",
+       "dataset_purpose": "downstream",
+       "train": "data/MathQA_Python_processed/mathqa_python_train_clean_final.json",
+       "val": "data/MathQA_Python_processed/mathqa_python_dev_clean_final.json",
+       "test": "data/MathQA_Python_processed/mathqa_python_test_clean_final.json",
+       "filling_field": [
+         "Question",
+         "Answer"
+       ]
+     },
+     "APPS_loader": {
+       "type": "list-like",
+       "dataset_purpose": "downstream",
+       "train": "data/APPS/apps_train.json",
+       "val": "data/APPS/apps_dev.json",
+       "test": "data/APPS/test/apps_test_75.json",
+       "filling_field": [
+         "Question",
+         "Answer"
+       ]
+     },
+     "MBPP_loader": {
+       "type": "list-like",
+       "dataset_purpose": "downstream",
+       "train": "data/MBPP/mbpp_train.json",
+       "val": "data/MBPP/mbpp_dev.json",
+       "test": "data/MBPP/mbpp_test.json",
+       "filling_field": [
+         "Question",
+         "Answer"
+       ]
+     },
+     "Arithmetic_Simple": {
+       "type": "list-like",
+       "dataset_purpose": "downstream",
+       "attributes": {
+         "subjects": [
+           1,
+           2,
+           3,
+           4,
+           5,
+           6,
+           7,
+           8,
+           9
+         ],
+         "lessons": [
+           "Max_Ops1_Bounds0_100",
+           "Max_Ops1_Bounds0_1000",
+           "Max_Ops2_Bounds0_100",
+           "Max_Ops2_Bounds0_1000",
+           "Max_Ops3_Bounds0_100",
+           "Max_Ops3_Bounds0_1000",
+           "Max_Ops4_Bounds0_100",
+           "Max_Ops4_Bounds0_1000",
+           "Max_Ops5_Bounds0_100",
+           "Max_Ops5_Bounds0_1000"
+         ]
+       },
+       "train": "data/Arithmetic/Curriculum_Simple",
+       "val": "data/Arithmetic/Curriculum_Simple",
+       "test": "data/Arithmetic/Curriculum_Simple",
+       "filling_field": [
+         "Question",
+         "Answer"
+       ]
+     },
+     "Arithmetic_Hard": {
+       "type": "list-like",
+       "dataset_purpose": "downstream",
+       "attributes": {
+         "subjects": [
+           1,
+           2,
+           3,
+           4,
+           5,
+           6,
+           7,
+           8,
+           9
+         ],
+         "lessons": [
+           "Max_Ops1_Bounds-1000_1000",
+           "Max_Ops1_Bounds-100_100",
+           "Max_Ops1_Bounds0_100",
+           "Max_Ops1_Bounds0_1000",
+           "Max_Ops2_Bounds-1000_1000",
+           "Max_Ops2_Bounds-100_100",
+           "Max_Ops2_Bounds0_100",
+           "Max_Ops2_Bounds0_1000",
+           "Max_Ops3_Bounds-1000_1000",
+           "Max_Ops3_Bounds-100_100",
+           "Max_Ops3_Bounds0_100",
+           "Max_Ops3_Bounds0_1000",
+           "Max_Ops4_Bounds-1000_1000",
+           "Max_Ops4_Bounds-100_100",
+           "Max_Ops4_Bounds0_100",
+           "Max_Ops4_Bounds0_1000",
+           "Max_Ops5_Bounds-1000_1000",
+           "Max_Ops5_Bounds-100_100",
+           "Max_Ops5_Bounds0_100",
+           "Max_Ops5_Bounds0_1000",
+           "Max_Ops6_Bounds-1000_1000",
+           "Max_Ops6_Bounds-100_100",
+           "Max_Ops6_Bounds0_100",
+           "Max_Ops6_Bounds0_1000",
+           "Max_Ops7_Bounds-1000_1000",
+           "Max_Ops7_Bounds-100_100",
+           "Max_Ops7_Bounds0_100",
+           "Max_Ops7_Bounds0_1000",
+           "Max_Ops8_Bounds-1000_1000",
+           "Max_Ops8_Bounds-100_100",
+           "Max_Ops8_Bounds0_100",
+           "Max_Ops8_Bounds0_1000",
+           "Max_Ops9_Bounds-1000_1000",
+           "Max_Ops9_Bounds-100_100",
+           "Max_Ops9_Bounds0_100",
+           "Max_Ops9_Bounds0_1000",
+           "Max_Ops10_Bounds-1000_1000",
+           "Max_Ops10_Bounds-100_100",
+           "Max_Ops10_Bounds0_100",
+           "Max_Ops10_Bounds0_1000"
+         ]
+       },
+       "train": "data/Arithmetic/Curriculum_Hard",
+       "val": "data/Arithmetic/Curriculum_Hard",
+       "test": "data/Arithmetic/Curriculum_Hard",
+       "filling_field": [
+         "Question",
+         "Answer"
+       ]
+     },
+     "Arithmetic_Hard_prompt_C11": {
+       "type": "list-like",
+       "dataset_purpose": "downstream",
+       "attributes": {
+         "subjects": [
+           1,
+           2,
+           3,
+           4,
+           5,
+           6,
+           7,
+           8,
+           9
+         ],
+         "lessons": [
+           "Max_Ops1_Bounds-1000_1000",
+           "Max_Ops1_Bounds-100_100",
+           "Max_Ops1_Bounds0_100",
+           "Max_Ops1_Bounds0_1000",
+           "Max_Ops2_Bounds-1000_1000",
+           "Max_Ops2_Bounds-100_100",
+           "Max_Ops2_Bounds0_100",
+           "Max_Ops2_Bounds0_1000",
+           "Max_Ops3_Bounds-1000_1000",
+           "Max_Ops3_Bounds-100_100",
+           "Max_Ops3_Bounds0_100",
+           "Max_Ops3_Bounds0_1000",
+           "Max_Ops4_Bounds-1000_1000",
+           "Max_Ops4_Bounds-100_100",
+           "Max_Ops4_Bounds0_100",
+           "Max_Ops4_Bounds0_1000",
+           "Max_Ops5_Bounds-1000_1000",
+           "Max_Ops5_Bounds-100_100",
+           "Max_Ops5_Bounds0_100",
+           "Max_Ops5_Bounds0_1000",
+           "Max_Ops6_Bounds-1000_1000",
+           "Max_Ops6_Bounds-100_100",
+           "Max_Ops6_Bounds0_100",
+           "Max_Ops6_Bounds0_1000",
+           "Max_Ops7_Bounds-1000_1000",
+           "Max_Ops7_Bounds-100_100",
+           "Max_Ops7_Bounds0_100",
+           "Max_Ops7_Bounds0_1000",
+           "Max_Ops8_Bounds-1000_1000",
+           "Max_Ops8_Bounds-100_100",
+           "Max_Ops8_Bounds0_100",
+           "Max_Ops8_Bounds0_1000",
+           "Max_Ops9_Bounds-1000_1000",
+           "Max_Ops9_Bounds-100_100",
+           "Max_Ops9_Bounds0_100",
+           "Max_Ops9_Bounds0_1000",
+           "Max_Ops10_Bounds-1000_1000",
+           "Max_Ops10_Bounds-100_100",
+           "Max_Ops10_Bounds0_100",
+           "Max_Ops10_Bounds0_1000"
+         ]
+       },
+       "train": "data/Arithmetic/Curriculum_Hard",
+       "val": "data/Arithmetic/Curriculum_Hard",
+       "test": "data/Arithmetic/Curriculum_Hard",
+       "filling_field": [
+         "Question",
+         "Answer"
+       ]
+     },
+     "Arithmetic_Hard_prompt_C12": {
+       "type": "list-like",
+       "dataset_purpose": "downstream",
+       "attributes": {
+         "subjects": [
+           1,
+           2,
+           3,
+           4
+         ],
+         "lessons": [
+           "Max_Ops1_Bounds-1000_1000",
+           "Max_Ops1_Bounds-100_100",
+           "Max_Ops1_Bounds0_100",
+           "Max_Ops1_Bounds0_1000",
+           "Max_Ops2_Bounds-1000_1000",
+           "Max_Ops2_Bounds-100_100",
+           "Max_Ops2_Bounds0_100",
+           "Max_Ops2_Bounds0_1000",
+           "Max_Ops3_Bounds-1000_1000",
+           "Max_Ops3_Bounds-100_100",
+           "Max_Ops3_Bounds0_100",
+           "Max_Ops3_Bounds0_1000",
+           "Max_Ops4_Bounds-1000_1000",
+           "Max_Ops4_Bounds-100_100",
+           "Max_Ops4_Bounds0_100",
+           "Max_Ops4_Bounds0_1000",
+           "Max_Ops5_Bounds-1000_1000",
+           "Max_Ops5_Bounds-100_100",
+           "Max_Ops5_Bounds0_100",
+           "Max_Ops5_Bounds0_1000",
+           "Max_Ops6_Bounds-1000_1000",
+           "Max_Ops6_Bounds-100_100",
+           "Max_Ops6_Bounds0_100",
+           "Max_Ops6_Bounds0_1000",
+           "Max_Ops7_Bounds-1000_1000",
+           "Max_Ops7_Bounds-100_100",
+           "Max_Ops7_Bounds0_100",
+           "Max_Ops7_Bounds0_1000",
+           "Max_Ops8_Bounds-1000_1000",
+           "Max_Ops8_Bounds-100_100",
+           "Max_Ops8_Bounds0_100",
+           "Max_Ops8_Bounds0_1000",
+           "Max_Ops9_Bounds-1000_1000",
+           "Max_Ops9_Bounds-100_100",
+           "Max_Ops9_Bounds0_100",
+           "Max_Ops9_Bounds0_1000",
+           "Max_Ops10_Bounds-1000_1000",
+           "Max_Ops10_Bounds-100_100",
+           "Max_Ops10_Bounds0_100",
+           "Max_Ops10_Bounds0_1000"
+         ]
+       },
+       "train": "data/Arithmetic/Curriculum_Hard",
+       "val": "data/Arithmetic/Curriculum_Hard",
+       "test": "data/Arithmetic/Curriculum_Hard",
+       "filling_field": [
+         "Question",
+         "Answer"
+       ]
+     },
+     "Arithmetic_XHard": {
+       "type": "list-like",
+       "dataset_purpose": "downstream",
+       "attributes": {
+         "subjects": [
+           1,
+           2,
+           3,
+           4,
+           5,
+           6,
+           7,
+           8,
+           9
+         ],
+         "lessons": [
+           "Max_Ops10_Bounds0_10000.json",
+           "Max_Ops10_Bounds0_1000.json",
+           "Max_Ops10_Bounds-10000_10000.json",
+           "Max_Ops10_Bounds-1000_1000.json",
+           "Max_Ops11_Bounds0_10000.json",
+           "Max_Ops11_Bounds0_1000.json",
+           "Max_Ops11_Bounds-10000_10000.json",
+           "Max_Ops11_Bounds-1000_1000.json",
+           "Max_Ops12_Bounds0_10000.json",
+           "Max_Ops12_Bounds0_1000.json",
+           "Max_Ops12_Bounds-10000_10000.json",
+           "Max_Ops12_Bounds-1000_1000.json",
+           "Max_Ops13_Bounds0_10000.json",
+           "Max_Ops13_Bounds0_1000.json",
+           "Max_Ops13_Bounds-10000_10000.json",
+           "Max_Ops13_Bounds-1000_1000.json",
+           "Max_Ops14_Bounds0_10000.json",
+           "Max_Ops14_Bounds0_1000.json",
+           "Max_Ops14_Bounds-10000_10000.json",
+           "Max_Ops14_Bounds-1000_1000.json",
+           "Max_Ops15_Bounds0_10000.json",
+           "Max_Ops15_Bounds0_1000.json",
+           "Max_Ops15_Bounds-10000_10000.json",
+           "Max_Ops15_Bounds-1000_1000.json",
+           "Max_Ops16_Bounds0_10000.json",
+           "Max_Ops16_Bounds0_1000.json",
+           "Max_Ops16_Bounds-10000_10000.json",
+           "Max_Ops16_Bounds-1000_1000.json",
+           "Max_Ops17_Bounds0_10000.json",
+           "Max_Ops17_Bounds0_1000.json",
+           "Max_Ops17_Bounds-10000_10000.json",
+           "Max_Ops17_Bounds-1000_1000.json",
+           "Max_Ops18_Bounds0_10000.json",
+           "Max_Ops18_Bounds0_1000.json",
+           "Max_Ops18_Bounds-10000_10000.json",
+           "Max_Ops18_Bounds-1000_1000.json",
+           "Max_Ops19_Bounds0_10000.json",
+           "Max_Ops19_Bounds0_1000.json",
+           "Max_Ops19_Bounds-10000_10000.json",
+           "Max_Ops19_Bounds-1000_1000.json",
+           "Max_Ops1_Bounds0_10000.json",
+           "Max_Ops1_Bounds0_1000.json",
+           "Max_Ops1_Bounds-10000_10000.json",
+           "Max_Ops1_Bounds-1000_1000.json",
+           "Max_Ops20_Bounds0_10000.json",
+           "Max_Ops20_Bounds0_1000.json",
+           "Max_Ops20_Bounds-10000_10000.json",
+           "Max_Ops20_Bounds-1000_1000.json",
+           "Max_Ops2_Bounds0_10000.json",
+           "Max_Ops2_Bounds0_1000.json",
+           "Max_Ops2_Bounds-10000_10000.json",
+           "Max_Ops2_Bounds-1000_1000.json",
+           "Max_Ops3_Bounds0_10000.json",
+           "Max_Ops3_Bounds0_1000.json",
+           "Max_Ops3_Bounds-10000_10000.json",
+           "Max_Ops3_Bounds-1000_1000.json",
+           "Max_Ops4_Bounds0_10000.json",
+           "Max_Ops4_Bounds0_1000.json",
+           "Max_Ops4_Bounds-10000_10000.json",
+           "Max_Ops4_Bounds-1000_1000.json",
+           "Max_Ops5_Bounds0_10000.json",
+           "Max_Ops5_Bounds0_1000.json",
+           "Max_Ops5_Bounds-10000_10000.json",
+           "Max_Ops5_Bounds-1000_1000.json",
+           "Max_Ops6_Bounds0_10000.json",
+           "Max_Ops6_Bounds0_1000.json",
+           "Max_Ops6_Bounds-10000_10000.json",
+           "Max_Ops6_Bounds-1000_1000.json",
+           "Max_Ops7_Bounds0_10000.json",
+           "Max_Ops7_Bounds0_1000.json",
+           "Max_Ops7_Bounds-10000_10000.json",
+           "Max_Ops7_Bounds-1000_1000.json",
+           "Max_Ops8_Bounds0_10000.json",
+           "Max_Ops8_Bounds0_1000.json",
+           "Max_Ops8_Bounds-10000_10000.json",
+           "Max_Ops8_Bounds-1000_1000.json",
+           "Max_Ops9_Bounds0_10000.json",
+           "Max_Ops9_Bounds0_1000.json",
+           "Max_Ops9_Bounds-10000_10000.json",
+           "Max_Ops9_Bounds-1000_1000.json"
+         ]
+       },
+       "train": "data/Arithmetic/Curriculum_XHard",
+       "val": "data/Arithmetic/Curriculum_XHard",
+       "test": "data/Arithmetic/Curriculum_XHard",
+       "filling_field": [
+         "Question",
+         "Answer"
+       ]
+     },
+     "GSM8K": {
+       "type": "local",
+       "dataset_purpose": "downstream",
+       "train_file": "data/GSM8K/GSM8K_train.json",
+       "val_file": "data/GSM8K/GSM8K_test.json",
+       "test_file": "data/GSM8K/GSM8K_dev.json",
+       "filling_field": [
+         "Body",
+         "Question",
+         "Answer"
+       ]
+     },
+     "APPS": {
+       "type": "local",
+       "dataset_purpose": "downstream",
+       "train_file": "data/APPS/apps_train.json",
+       "val_file": "data/APPS/apps_test.json",
+       "test_file": "data/APPS/apps_dev.json",
+       "filling_field": [
+         "Body",
+         "Question",
+         "Answer"
+       ]
+     },
+     "ghcode_python": {
+       "type": "huggingface",
+       "dataset_purpose": "pretrain",
+       "name": "slseanwu/ghcode_python_split_700k",
+       "max_eval_size": 1000,
+       "max_train_size": 160000,
+       "filling_field": [
+         "code"
+       ]
+     }
+   }
+ }
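
The "deepseek" section above is the one command.txt selects: LoRA rank 32 (alpha 64) over all seven Llama projection modules, fp16 training for 15000 steps at learning rate 2e-05 on the ghcode_python pretraining split. A sketch of how that section could be materialized into peft and transformers objects; the key names mirror run_config.json, but whether the repo's own trainer consumes them exactly this way is an assumption:

```python
# Sketch, assuming run_config.json is read as-is into peft/transformers
# objects; the repo's actual plumbing in main.py is not shown in this commit.
import json
from peft import LoraConfig
from transformers import TrainingArguments

with open("run_config.json") as f:
    cfg = json.load(f)["model"]["deepseek"]

lora = LoraConfig(**cfg["lora_config"])           # r=32, alpha=64, 7 target modules
args = TrainingArguments(**cfg["training_args"])  # 15000 steps, lr 2e-5, fp16

# Effective batch size: 8 per device x 4 accumulation steps = 32 sequences.
print(args.per_device_train_batch_size * args.gradient_accumulation_steps)
```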
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7dbf3c56b55bacde1773bb6b7271e30823177a28ab1b1a0ab88b4d8b5a78d8e8
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,1095 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 3.0,
+   "eval_steps": 5000,
+   "global_step": 15000,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.02,
+       "grad_norm": 1.0555673837661743,
+       "learning_rate": 1.9866666666666667e-05,
+       "loss": 0.9803,
+       "step": 100
+     },
+     {
+       "epoch": 0.04,
+       "grad_norm": 1.0609339475631714,
+       "learning_rate": 1.9733333333333336e-05,
+       "loss": 0.9477,
+       "step": 200
+     },
+     {
+       "epoch": 0.06,
+       "grad_norm": 0.8154662251472473,
+       "learning_rate": 1.9600000000000002e-05,
+       "loss": 0.9472,
+       "step": 300
+     },
+     {
+       "epoch": 0.08,
+       "grad_norm": 0.8507415652275085,
+       "learning_rate": 1.9466666666666668e-05,
+       "loss": 0.9496,
+       "step": 400
+     },
+     {
+       "epoch": 0.1,
+       "grad_norm": 0.9045298099517822,
+       "learning_rate": 1.9333333333333333e-05,
+       "loss": 0.9416,
+       "step": 500
+     },
+     {
+       "epoch": 0.12,
+       "grad_norm": 1.2995306253433228,
+       "learning_rate": 1.9200000000000003e-05,
+       "loss": 0.9501,
+       "step": 600
+     },
+     {
+       "epoch": 0.14,
+       "grad_norm": 0.8860793709754944,
+       "learning_rate": 1.9066666666666668e-05,
+       "loss": 0.929,
+       "step": 700
+     },
+     {
+       "epoch": 0.16,
+       "grad_norm": 0.8791409134864807,
+       "learning_rate": 1.8933333333333334e-05,
+       "loss": 0.9323,
+       "step": 800
+     },
+     {
+       "epoch": 0.18,
+       "grad_norm": 0.868554949760437,
+       "learning_rate": 1.88e-05,
+       "loss": 0.9252,
+       "step": 900
+     },
+     {
+       "epoch": 0.2,
+       "grad_norm": 0.7996535301208496,
+       "learning_rate": 1.866666666666667e-05,
+       "loss": 0.916,
+       "step": 1000
+     },
+     {
+       "epoch": 0.22,
+       "grad_norm": 0.9991319179534912,
+       "learning_rate": 1.8533333333333334e-05,
+       "loss": 0.9267,
+       "step": 1100
+     },
+     {
+       "epoch": 0.24,
+       "grad_norm": 1.0026607513427734,
+       "learning_rate": 1.8400000000000003e-05,
+       "loss": 0.9281,
+       "step": 1200
+     },
+     {
+       "epoch": 0.26,
+       "grad_norm": 0.860996663570404,
+       "learning_rate": 1.826666666666667e-05,
+       "loss": 0.9272,
+       "step": 1300
+     },
+     {
+       "epoch": 0.28,
+       "grad_norm": 0.8613474369049072,
+       "learning_rate": 1.8133333333333335e-05,
+       "loss": 0.9277,
+       "step": 1400
+     },
+     {
+       "epoch": 0.3,
+       "grad_norm": 0.8491615653038025,
+       "learning_rate": 1.8e-05,
+       "loss": 0.9293,
+       "step": 1500
+     },
+     {
+       "epoch": 0.32,
+       "grad_norm": 0.9264828562736511,
+       "learning_rate": 1.7866666666666666e-05,
+       "loss": 0.917,
+       "step": 1600
+     },
+     {
+       "epoch": 0.34,
+       "grad_norm": 0.8633733987808228,
+       "learning_rate": 1.7733333333333335e-05,
+       "loss": 0.9325,
+       "step": 1700
+     },
+     {
+       "epoch": 0.36,
+       "grad_norm": 0.8906639218330383,
+       "learning_rate": 1.76e-05,
+       "loss": 0.9126,
+       "step": 1800
+     },
+     {
+       "epoch": 0.38,
+       "grad_norm": 0.885764479637146,
+       "learning_rate": 1.7466666666666667e-05,
+       "loss": 0.9065,
+       "step": 1900
+     },
+     {
+       "epoch": 0.4,
+       "grad_norm": 0.8839893937110901,
+       "learning_rate": 1.7333333333333336e-05,
+       "loss": 0.9429,
+       "step": 2000
+     },
+     {
+       "epoch": 0.42,
+       "grad_norm": 0.913644552230835,
+       "learning_rate": 1.72e-05,
+       "loss": 0.9056,
+       "step": 2100
+     },
+     {
+       "epoch": 0.44,
+       "grad_norm": 0.8399360179901123,
+       "learning_rate": 1.706666666666667e-05,
+       "loss": 0.913,
+       "step": 2200
+     },
+     {
+       "epoch": 0.46,
+       "grad_norm": 0.8176397681236267,
+       "learning_rate": 1.6933333333333336e-05,
+       "loss": 0.9192,
+       "step": 2300
+     },
+     {
+       "epoch": 0.48,
+       "grad_norm": 0.8771522045135498,
+       "learning_rate": 1.6800000000000002e-05,
+       "loss": 0.9222,
+       "step": 2400
+     },
+     {
+       "epoch": 0.5,
+       "grad_norm": 0.7874905467033386,
+       "learning_rate": 1.6666666666666667e-05,
+       "loss": 0.9197,
+       "step": 2500
+     },
+     {
+       "epoch": 0.52,
+       "grad_norm": 0.8051652312278748,
+       "learning_rate": 1.6533333333333333e-05,
+       "loss": 0.91,
+       "step": 2600
+     },
+     {
+       "epoch": 0.54,
+       "grad_norm": 0.80189049243927,
+       "learning_rate": 1.64e-05,
+       "loss": 0.9091,
+       "step": 2700
+     },
+     {
+       "epoch": 0.56,
+       "grad_norm": 0.7678588032722473,
+       "learning_rate": 1.6266666666666668e-05,
+       "loss": 0.9111,
+       "step": 2800
+     },
+     {
+       "epoch": 0.58,
+       "grad_norm": 0.7835684418678284,
+       "learning_rate": 1.6133333333333334e-05,
+       "loss": 0.9127,
+       "step": 2900
+     },
+     {
+       "epoch": 0.6,
+       "grad_norm": 0.8735753297805786,
+       "learning_rate": 1.6000000000000003e-05,
+       "loss": 0.9038,
+       "step": 3000
+     },
+     {
+       "epoch": 0.62,
+       "grad_norm": 0.8635367155075073,
+       "learning_rate": 1.586666666666667e-05,
+       "loss": 0.9148,
+       "step": 3100
+     },
+     {
+       "epoch": 0.64,
+       "grad_norm": 0.8477209210395813,
+       "learning_rate": 1.5733333333333334e-05,
+       "loss": 0.9123,
+       "step": 3200
+     },
+     {
+       "epoch": 0.66,
+       "grad_norm": 0.7765554785728455,
+       "learning_rate": 1.5600000000000003e-05,
+       "loss": 0.8898,
+       "step": 3300
+     },
+     {
+       "epoch": 0.68,
+       "grad_norm": 0.891146183013916,
+       "learning_rate": 1.546666666666667e-05,
+       "loss": 0.8992,
+       "step": 3400
+     },
+     {
+       "epoch": 0.7,
+       "grad_norm": 0.8695024251937866,
+       "learning_rate": 1.5333333333333334e-05,
+       "loss": 0.9114,
+       "step": 3500
+     },
+     {
+       "epoch": 0.72,
+       "grad_norm": 0.927379846572876,
+       "learning_rate": 1.5200000000000002e-05,
+       "loss": 0.89,
+       "step": 3600
+     },
+     {
+       "epoch": 0.74,
+       "grad_norm": 0.885179340839386,
+       "learning_rate": 1.5066666666666668e-05,
+       "loss": 0.9159,
+       "step": 3700
+     },
+     {
+       "epoch": 0.76,
+       "grad_norm": 0.9603422284126282,
+       "learning_rate": 1.4933333333333335e-05,
+       "loss": 0.9115,
+       "step": 3800
+     },
+     {
+       "epoch": 0.78,
+       "grad_norm": 0.8624181151390076,
+       "learning_rate": 1.48e-05,
+       "loss": 0.9025,
+       "step": 3900
+     },
+     {
+       "epoch": 0.8,
+       "grad_norm": 0.9430868625640869,
+       "learning_rate": 1.4666666666666666e-05,
+       "loss": 0.8984,
+       "step": 4000
+     },
+     {
+       "epoch": 0.82,
+       "grad_norm": 0.7860404849052429,
+       "learning_rate": 1.4533333333333335e-05,
+       "loss": 0.8887,
+       "step": 4100
+     },
+     {
+       "epoch": 0.84,
+       "grad_norm": 0.793118417263031,
+       "learning_rate": 1.4400000000000001e-05,
+       "loss": 0.9097,
+       "step": 4200
+     },
+     {
+       "epoch": 0.86,
+       "grad_norm": 0.8741154074668884,
+       "learning_rate": 1.4266666666666668e-05,
+       "loss": 0.8919,
+       "step": 4300
+     },
+     {
+       "epoch": 0.88,
+       "grad_norm": 0.7793194651603699,
+       "learning_rate": 1.4133333333333334e-05,
+       "loss": 0.8927,
+       "step": 4400
+     },
+     {
+       "epoch": 0.9,
+       "grad_norm": 0.7923697233200073,
+       "learning_rate": 1.4e-05,
+       "loss": 0.9032,
+       "step": 4500
+     },
+     {
+       "epoch": 0.92,
+       "grad_norm": 0.8406550288200378,
+       "learning_rate": 1.3866666666666669e-05,
+       "loss": 0.8878,
+       "step": 4600
+     },
+     {
+       "epoch": 0.94,
+       "grad_norm": 1.0170931816101074,
+       "learning_rate": 1.3733333333333335e-05,
+       "loss": 0.9032,
+       "step": 4700
+     },
+     {
+       "epoch": 0.96,
+       "grad_norm": 0.8855489492416382,
+       "learning_rate": 1.3600000000000002e-05,
+       "loss": 0.9026,
+       "step": 4800
+     },
+     {
+       "epoch": 0.98,
+       "grad_norm": 0.7720271348953247,
+       "learning_rate": 1.3466666666666668e-05,
+       "loss": 0.8978,
+       "step": 4900
+     },
+     {
+       "epoch": 1.0,
+       "grad_norm": 0.841073751449585,
+       "learning_rate": 1.3333333333333333e-05,
+       "loss": 0.8958,
+       "step": 5000
+     },
+     {
+       "epoch": 1.0,
+       "eval_loss": 0.88493812084198,
+       "eval_runtime": 62.115,
+       "eval_samples_per_second": 16.099,
+       "eval_steps_per_second": 2.012,
+       "step": 5000
+     },
+     {
+       "epoch": 1.02,
+       "grad_norm": 0.856351375579834,
+       "learning_rate": 1.3200000000000002e-05,
+       "loss": 0.7886,
+       "step": 5100
+     },
+     {
+       "epoch": 1.04,
+       "grad_norm": 0.92889404296875,
+       "learning_rate": 1.3066666666666668e-05,
+       "loss": 0.7957,
+       "step": 5200
+     },
+     {
+       "epoch": 1.06,
+       "grad_norm": 0.8512794375419617,
+       "learning_rate": 1.2933333333333334e-05,
+       "loss": 0.7887,
+       "step": 5300
+     },
+     {
+       "epoch": 1.08,
+       "grad_norm": 0.8943263292312622,
+       "learning_rate": 1.2800000000000001e-05,
+       "loss": 0.8119,
+       "step": 5400
+     },
+     {
+       "epoch": 1.1,
+       "grad_norm": 0.908469557762146,
+       "learning_rate": 1.2666666666666667e-05,
+       "loss": 0.7958,
+       "step": 5500
+     },
+     {
+       "epoch": 1.12,
+       "grad_norm": 0.790229856967926,
+       "learning_rate": 1.2533333333333336e-05,
+       "loss": 0.7958,
+       "step": 5600
+     },
+     {
+       "epoch": 1.14,
+       "grad_norm": 1.0110950469970703,
+       "learning_rate": 1.2400000000000002e-05,
+       "loss": 0.7907,
+       "step": 5700
+     },
+     {
+       "epoch": 1.16,
+       "grad_norm": 0.9110943078994751,
+       "learning_rate": 1.2266666666666667e-05,
+       "loss": 0.7939,
+       "step": 5800
+     },
+     {
+       "epoch": 1.18,
+       "grad_norm": 0.8515949249267578,
+       "learning_rate": 1.2133333333333335e-05,
+       "loss": 0.7849,
+       "step": 5900
+     },
+     {
+       "epoch": 1.2,
+       "grad_norm": 0.8418060541152954,
+       "learning_rate": 1.2e-05,
+       "loss": 0.8036,
+       "step": 6000
+     },
+     {
+       "epoch": 1.22,
+       "grad_norm": 0.8303418159484863,
+       "learning_rate": 1.186666666666667e-05,
+       "loss": 0.8002,
+       "step": 6100
+     },
+     {
+       "epoch": 1.24,
+       "grad_norm": 0.8996633291244507,
+       "learning_rate": 1.1733333333333335e-05,
+       "loss": 0.8005,
+       "step": 6200
+     },
+     {
+       "epoch": 1.26,
+       "grad_norm": 0.8889008164405823,
+       "learning_rate": 1.16e-05,
+       "loss": 0.7956,
+       "step": 6300
+     },
+     {
+       "epoch": 1.28,
+       "grad_norm": 0.8718474507331848,
+       "learning_rate": 1.1466666666666668e-05,
+       "loss": 0.786,
+       "step": 6400
+     },
+     {
+       "epoch": 1.3,
+       "grad_norm": 0.931644856929779,
+       "learning_rate": 1.1333333333333334e-05,
+       "loss": 0.8017,
+       "step": 6500
+     },
+     {
+       "epoch": 1.32,
+       "grad_norm": 0.873427152633667,
+       "learning_rate": 1.1200000000000001e-05,
+       "loss": 0.7855,
+       "step": 6600
+     },
+     {
+       "epoch": 1.34,
+       "grad_norm": 0.8685030341148376,
+       "learning_rate": 1.1066666666666669e-05,
+       "loss": 0.802,
+       "step": 6700
+     },
+     {
+       "epoch": 1.36,
+       "grad_norm": 0.8744147419929504,
+       "learning_rate": 1.0933333333333334e-05,
+       "loss": 0.7936,
+       "step": 6800
+     },
+     {
+       "epoch": 1.38,
+       "grad_norm": 0.9263073205947876,
+       "learning_rate": 1.0800000000000002e-05,
+       "loss": 0.7864,
+       "step": 6900
+     },
+     {
+       "epoch": 1.4,
+       "grad_norm": 0.8322786688804626,
+       "learning_rate": 1.0666666666666667e-05,
+       "loss": 0.7886,
+       "step": 7000
+     },
+     {
+       "epoch": 1.42,
+       "grad_norm": 0.8734496235847473,
+       "learning_rate": 1.0533333333333333e-05,
+       "loss": 0.7889,
+       "step": 7100
+     },
+     {
+       "epoch": 1.44,
+       "grad_norm": 1.0558969974517822,
+       "learning_rate": 1.04e-05,
+       "loss": 0.8092,
+       "step": 7200
+     },
+     {
+       "epoch": 1.46,
+       "grad_norm": 0.8517795205116272,
+       "learning_rate": 1.0266666666666668e-05,
+       "loss": 0.8061,
+       "step": 7300
+     },
+     {
+       "epoch": 1.48,
+       "grad_norm": 0.8912076354026794,
+       "learning_rate": 1.0133333333333335e-05,
+       "loss": 0.8073,
+       "step": 7400
+     },
+     {
+       "epoch": 1.5,
+       "grad_norm": 0.8496484756469727,
+       "learning_rate": 1e-05,
+       "loss": 0.789,
+       "step": 7500
+     },
+     {
+       "epoch": 1.52,
+       "grad_norm": 0.8365699648857117,
+       "learning_rate": 9.866666666666668e-06,
+       "loss": 0.7813,
+       "step": 7600
+     },
+     {
+       "epoch": 1.54,
+       "grad_norm": 0.8612975478172302,
+       "learning_rate": 9.733333333333334e-06,
+       "loss": 0.7923,
+       "step": 7700
+     },
+     {
+       "epoch": 1.56,
+       "grad_norm": 0.825588047504425,
+       "learning_rate": 9.600000000000001e-06,
+       "loss": 0.8017,
+       "step": 7800
+     },
+     {
+       "epoch": 1.58,
+       "grad_norm": 0.9208545684814453,
+       "learning_rate": 9.466666666666667e-06,
+       "loss": 0.7975,
+       "step": 7900
+     },
+     {
+       "epoch": 1.6,
+       "grad_norm": 0.8390321731567383,
+       "learning_rate": 9.333333333333334e-06,
+       "loss": 0.8023,
+       "step": 8000
+     },
+     {
+       "epoch": 1.62,
+       "grad_norm": 0.9178739786148071,
+       "learning_rate": 9.200000000000002e-06,
+       "loss": 0.7965,
+       "step": 8100
+     },
+     {
+       "epoch": 1.64,
+       "grad_norm": 0.9349595904350281,
+       "learning_rate": 9.066666666666667e-06,
+       "loss": 0.7977,
+       "step": 8200
+     },
+     {
+       "epoch": 1.66,
+       "grad_norm": 0.8347993493080139,
+       "learning_rate": 8.933333333333333e-06,
+       "loss": 0.7898,
+       "step": 8300
+     },
+     {
+       "epoch": 1.68,
+       "grad_norm": 0.8217402696609497,
+       "learning_rate": 8.8e-06,
+       "loss": 0.7903,
+       "step": 8400
+     },
+     {
+       "epoch": 1.7,
+       "grad_norm": 0.8198782801628113,
+       "learning_rate": 8.666666666666668e-06,
+       "loss": 0.8032,
+       "step": 8500
+     },
+     {
+       "epoch": 1.72,
+       "grad_norm": 0.8704864382743835,
+       "learning_rate": 8.533333333333335e-06,
+       "loss": 0.7967,
+       "step": 8600
+     },
+     {
+       "epoch": 1.74,
+       "grad_norm": 0.892342746257782,
+       "learning_rate": 8.400000000000001e-06,
+       "loss": 0.7884,
+       "step": 8700
+     },
+     {
+       "epoch": 1.76,
+       "grad_norm": 0.9803333282470703,
+       "learning_rate": 8.266666666666667e-06,
+       "loss": 0.7859,
+       "step": 8800
+     },
+     {
+       "epoch": 1.78,
+       "grad_norm": 0.8991880416870117,
+       "learning_rate": 8.133333333333334e-06,
+       "loss": 0.7872,
+       "step": 8900
+     },
+     {
+       "epoch": 1.8,
+       "grad_norm": 1.0206116437911987,
+       "learning_rate": 8.000000000000001e-06,
+       "loss": 0.7967,
+       "step": 9000
+     },
+     {
+       "epoch": 1.82,
+       "grad_norm": 0.8060243725776672,
+       "learning_rate": 7.866666666666667e-06,
+       "loss": 0.7893,
+       "step": 9100
+     },
+     {
+       "epoch": 1.84,
+       "grad_norm": 0.8802576065063477,
+       "learning_rate": 7.733333333333334e-06,
+       "loss": 0.7985,
+       "step": 9200
+     },
+     {
+       "epoch": 1.86,
+       "grad_norm": 0.8549374341964722,
+       "learning_rate": 7.600000000000001e-06,
+       "loss": 0.7889,
+       "step": 9300
+     },
+     {
+       "epoch": 1.88,
+       "grad_norm": 0.9457976222038269,
+       "learning_rate": 7.4666666666666675e-06,
+       "loss": 0.8061,
+       "step": 9400
+     },
+     {
+       "epoch": 1.9,
+       "grad_norm": 0.8606815338134766,
+       "learning_rate": 7.333333333333333e-06,
+       "loss": 0.7866,
+       "step": 9500
+     },
+     {
+       "epoch": 1.92,
+       "grad_norm": 0.8577678799629211,
+       "learning_rate": 7.2000000000000005e-06,
+       "loss": 0.7951,
+       "step": 9600
+     },
+     {
+       "epoch": 1.94,
+       "grad_norm": 0.9072070717811584,
+       "learning_rate": 7.066666666666667e-06,
+       "loss": 0.7879,
+       "step": 9700
+     },
+     {
+       "epoch": 1.96,
+       "grad_norm": 0.8739129304885864,
+       "learning_rate": 6.9333333333333344e-06,
+       "loss": 0.7936,
+       "step": 9800
+     },
+     {
+       "epoch": 1.98,
+       "grad_norm": 0.8881368637084961,
+       "learning_rate": 6.800000000000001e-06,
+       "loss": 0.7825,
+       "step": 9900
+     },
+     {
+       "epoch": 2.0,
+       "grad_norm": 0.9448174834251404,
+       "learning_rate": 6.666666666666667e-06,
+       "loss": 0.7837,
+       "step": 10000
+     },
+     {
+       "epoch": 2.0,
+       "eval_loss": 0.8766704201698303,
+       "eval_runtime": 64.0882,
+       "eval_samples_per_second": 15.604,
+       "eval_steps_per_second": 1.95,
+       "step": 10000
+     },
+     {
+       "epoch": 2.02,
+       "grad_norm": 0.8892523050308228,
+       "learning_rate": 6.534666666666667e-06,
+       "loss": 0.7014,
+       "step": 10100
+     },
+     {
+       "epoch": 2.04,
+       "grad_norm": 0.9131534695625305,
+       "learning_rate": 6.4013333333333334e-06,
+       "loss": 0.6962,
+       "step": 10200
+     },
+     {
+       "epoch": 2.06,
+       "grad_norm": 0.9470012784004211,
+       "learning_rate": 6.268000000000001e-06,
+       "loss": 0.6874,
+       "step": 10300
+     },
+     {
+       "epoch": 2.08,
+       "grad_norm": 0.9785165190696716,
+       "learning_rate": 6.134666666666667e-06,
+       "loss": 0.7029,
+       "step": 10400
+     },
+     {
+       "epoch": 2.1,
+       "grad_norm": 1.0245338678359985,
+       "learning_rate": 6.001333333333334e-06,
+       "loss": 0.6922,
+       "step": 10500
+     },
+     {
+       "epoch": 2.12,
+       "grad_norm": 0.920975387096405,
+       "learning_rate": 5.868e-06,
+       "loss": 0.6973,
+       "step": 10600
+     },
+     {
+       "epoch": 2.14,
+       "grad_norm": 0.9391937851905823,
+       "learning_rate": 5.734666666666667e-06,
+       "loss": 0.6917,
+       "step": 10700
+     },
+     {
+       "epoch": 2.16,
+       "grad_norm": 0.9682599306106567,
+       "learning_rate": 5.601333333333334e-06,
+       "loss": 0.6965,
+       "step": 10800
+     },
+     {
+       "epoch": 2.18,
+       "grad_norm": 1.0669585466384888,
+       "learning_rate": 5.468e-06,
+       "loss": 0.698,
+       "step": 10900
+     },
+     {
+       "epoch": 2.2,
+       "grad_norm": 1.0977643728256226,
+       "learning_rate": 5.3346666666666665e-06,
+       "loss": 0.6961,
+       "step": 11000
+     },
+     {
+       "epoch": 2.22,
+       "grad_norm": 0.9608291387557983,
+       "learning_rate": 5.201333333333334e-06,
+       "loss": 0.6935,
+       "step": 11100
+     },
+     {
+       "epoch": 2.24,
+       "grad_norm": 0.9867018461227417,
+       "learning_rate": 5.069333333333334e-06,
+       "loss": 0.691,
+       "step": 11200
+     },
+     {
+       "epoch": 2.26,
+       "grad_norm": 0.9778249263763428,
+       "learning_rate": 4.936e-06,
+       "loss": 0.6973,
+       "step": 11300
+     },
+     {
+       "epoch": 2.28,
+       "grad_norm": 1.040911316871643,
+       "learning_rate": 4.802666666666667e-06,
+       "loss": 0.6958,
+       "step": 11400
+     },
+     {
+       "epoch": 2.3,
+       "grad_norm": 0.948652446269989,
+       "learning_rate": 4.669333333333334e-06,
+       "loss": 0.7075,
+       "step": 11500
+     },
832
+ {
833
+ "epoch": 2.32,
834
+ "grad_norm": 1.045653223991394,
835
+ "learning_rate": 4.536e-06,
836
+ "loss": 0.6966,
837
+ "step": 11600
838
+ },
839
+ {
840
+ "epoch": 2.34,
841
+ "grad_norm": 0.9575178623199463,
842
+ "learning_rate": 4.4040000000000005e-06,
843
+ "loss": 0.7036,
844
+ "step": 11700
845
+ },
846
+ {
847
+ "epoch": 2.36,
848
+ "grad_norm": 0.8968670964241028,
849
+ "learning_rate": 4.270666666666667e-06,
850
+ "loss": 0.7038,
851
+ "step": 11800
852
+ },
853
+ {
854
+ "epoch": 2.38,
855
+ "grad_norm": 0.9445046186447144,
856
+ "learning_rate": 4.137333333333334e-06,
857
+ "loss": 0.687,
858
+ "step": 11900
859
+ },
860
+ {
861
+ "epoch": 2.4,
862
+ "grad_norm": 1.0018490552902222,
863
+ "learning_rate": 4.004e-06,
864
+ "loss": 0.6978,
865
+ "step": 12000
866
+ },
867
+ {
868
+ "epoch": 2.42,
869
+ "grad_norm": 0.9981265664100647,
870
+ "learning_rate": 3.870666666666667e-06,
871
+ "loss": 0.6876,
872
+ "step": 12100
873
+ },
874
+ {
875
+ "epoch": 2.44,
876
+ "grad_norm": 0.9957240223884583,
877
+ "learning_rate": 3.737333333333333e-06,
878
+ "loss": 0.691,
879
+ "step": 12200
880
+ },
881
+ {
882
+ "epoch": 2.46,
883
+ "grad_norm": 1.0652461051940918,
884
+ "learning_rate": 3.604e-06,
885
+ "loss": 0.7026,
886
+ "step": 12300
887
+ },
888
+ {
889
+ "epoch": 2.48,
890
+ "grad_norm": 0.9067023992538452,
891
+ "learning_rate": 3.470666666666667e-06,
892
+ "loss": 0.6918,
893
+ "step": 12400
894
+ },
895
+ {
896
+ "epoch": 2.5,
897
+ "grad_norm": 0.997002899646759,
898
+ "learning_rate": 3.3373333333333336e-06,
899
+ "loss": 0.7013,
900
+ "step": 12500
901
+ },
902
+ {
903
+ "epoch": 2.52,
904
+ "grad_norm": 0.914594829082489,
905
+ "learning_rate": 3.2040000000000006e-06,
906
+ "loss": 0.7025,
907
+ "step": 12600
908
+ },
909
+ {
910
+ "epoch": 2.54,
911
+ "grad_norm": 1.197375774383545,
912
+ "learning_rate": 3.0706666666666667e-06,
913
+ "loss": 0.7053,
914
+ "step": 12700
915
+ },
916
+ {
917
+ "epoch": 2.56,
918
+ "grad_norm": 1.0545796155929565,
919
+ "learning_rate": 2.9373333333333336e-06,
920
+ "loss": 0.6888,
921
+ "step": 12800
922
+ },
923
+ {
924
+ "epoch": 2.58,
925
+ "grad_norm": 1.0479904413223267,
926
+ "learning_rate": 2.804e-06,
927
+ "loss": 0.6853,
928
+ "step": 12900
929
+ },
930
+ {
931
+ "epoch": 2.6,
932
+ "grad_norm": 0.9415286183357239,
933
+ "learning_rate": 2.670666666666667e-06,
934
+ "loss": 0.7053,
935
+ "step": 13000
936
+ },
937
+ {
938
+ "epoch": 2.62,
939
+ "grad_norm": 0.9765642881393433,
940
+ "learning_rate": 2.5373333333333332e-06,
941
+ "loss": 0.7093,
942
+ "step": 13100
943
+ },
944
+ {
945
+ "epoch": 2.64,
946
+ "grad_norm": 0.9646571278572083,
947
+ "learning_rate": 2.404e-06,
948
+ "loss": 0.7026,
949
+ "step": 13200
950
+ },
951
+ {
952
+ "epoch": 2.66,
953
+ "grad_norm": 0.9278367757797241,
954
+ "learning_rate": 2.2706666666666667e-06,
955
+ "loss": 0.6967,
956
+ "step": 13300
957
+ },
958
+ {
959
+ "epoch": 2.68,
960
+ "grad_norm": 0.9418883323669434,
961
+ "learning_rate": 2.1373333333333337e-06,
962
+ "loss": 0.699,
963
+ "step": 13400
964
+ },
965
+ {
966
+ "epoch": 2.7,
967
+ "grad_norm": 1.0113035440444946,
968
+ "learning_rate": 2.004e-06,
969
+ "loss": 0.6937,
970
+ "step": 13500
971
+ },
972
+ {
973
+ "epoch": 2.72,
974
+ "grad_norm": 1.0150426626205444,
975
+ "learning_rate": 1.8706666666666667e-06,
976
+ "loss": 0.6795,
977
+ "step": 13600
978
+ },
979
+ {
980
+ "epoch": 2.74,
981
+ "grad_norm": 1.177462100982666,
982
+ "learning_rate": 1.7373333333333333e-06,
983
+ "loss": 0.6822,
984
+ "step": 13700
985
+ },
986
+ {
987
+ "epoch": 2.76,
988
+ "grad_norm": 1.1305912733078003,
989
+ "learning_rate": 1.604e-06,
990
+ "loss": 0.7039,
991
+ "step": 13800
992
+ },
993
+ {
994
+ "epoch": 2.78,
995
+ "grad_norm": 0.9975621700286865,
996
+ "learning_rate": 1.470666666666667e-06,
997
+ "loss": 0.6864,
998
+ "step": 13900
999
+ },
1000
+ {
1001
+ "epoch": 2.8,
1002
+ "grad_norm": 1.0691418647766113,
1003
+ "learning_rate": 1.3373333333333335e-06,
1004
+ "loss": 0.6945,
1005
+ "step": 14000
1006
+ },
1007
+ {
1008
+ "epoch": 2.82,
1009
+ "grad_norm": 0.988650381565094,
1010
+ "learning_rate": 1.204e-06,
1011
+ "loss": 0.6891,
1012
+ "step": 14100
1013
+ },
1014
+ {
1015
+ "epoch": 2.84,
1016
+ "grad_norm": 1.0204881429672241,
1017
+ "learning_rate": 1.0706666666666668e-06,
1018
+ "loss": 0.6873,
1019
+ "step": 14200
1020
+ },
1021
+ {
1022
+ "epoch": 2.86,
1023
+ "grad_norm": 1.0628246068954468,
1024
+ "learning_rate": 9.373333333333334e-07,
1025
+ "loss": 0.691,
1026
+ "step": 14300
1027
+ },
1028
+ {
1029
+ "epoch": 2.88,
1030
+ "grad_norm": 1.058432698249817,
1031
+ "learning_rate": 8.04e-07,
1032
+ "loss": 0.6938,
1033
+ "step": 14400
1034
+ },
1035
+ {
1036
+ "epoch": 2.9,
1037
+ "grad_norm": 1.0059112310409546,
1038
+ "learning_rate": 6.706666666666667e-07,
1039
+ "loss": 0.7113,
1040
+ "step": 14500
1041
+ },
1042
+ {
1043
+ "epoch": 2.92,
1044
+ "grad_norm": 0.963862955570221,
1045
+ "learning_rate": 5.373333333333334e-07,
1046
+ "loss": 0.6831,
1047
+ "step": 14600
1048
+ },
1049
+ {
1050
+ "epoch": 2.94,
1051
+ "grad_norm": 0.9393936395645142,
1052
+ "learning_rate": 4.04e-07,
1053
+ "loss": 0.7055,
1054
+ "step": 14700
1055
+ },
1056
+ {
1057
+ "epoch": 2.96,
1058
+ "grad_norm": 1.1199434995651245,
1059
+ "learning_rate": 2.706666666666667e-07,
1060
+ "loss": 0.6944,
1061
+ "step": 14800
1062
+ },
1063
+ {
1064
+ "epoch": 2.98,
1065
+ "grad_norm": 1.1279562711715698,
1066
+ "learning_rate": 1.3733333333333335e-07,
1067
+ "loss": 0.7033,
1068
+ "step": 14900
1069
+ },
1070
+ {
1071
+ "epoch": 3.0,
1072
+ "grad_norm": 0.9235104918479919,
1073
+ "learning_rate": 4e-09,
1074
+ "loss": 0.6902,
1075
+ "step": 15000
1076
+ },
1077
+ {
1078
+ "epoch": 3.0,
1079
+ "eval_loss": 0.8971832394599915,
1080
+ "eval_runtime": 53.8519,
1081
+ "eval_samples_per_second": 18.569,
1082
+ "eval_steps_per_second": 2.321,
1083
+ "step": 15000
1084
+ }
1085
+ ],
1086
+ "logging_steps": 100,
1087
+ "max_steps": 15000,
1088
+ "num_input_tokens_seen": 0,
1089
+ "num_train_epochs": 3,
1090
+ "save_steps": 5000,
1091
+ "total_flos": 1.88804379967488e+18,
1092
+ "train_batch_size": 8,
1093
+ "trial_name": null,
1094
+ "trial_params": null
1095
+ }
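The block above is the tail of a standard Hugging Face Trainer state file (trainer_state.json): log_history interleaves training records (loss, grad_norm, learning_rate, step) with per-epoch eval records (eval_loss, eval_runtime, and so on). A minimal sketch for reading it back, assuming the file sits next to the checkpoint under the name trainer_state.json:

import json

# Load the state the Trainer writes alongside each checkpoint.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training records carry "loss"; evaluation records carry "eval_loss".
train_curve = [(r["step"], r["loss"]) for r in state["log_history"] if "loss" in r]
eval_curve = [(r["step"], r["eval_loss"]) for r in state["log_history"] if "eval_loss" in r]

print(train_curve[-1])  # (15000, 0.6902) for this run
print(eval_curve)       # evals at the epoch boundaries: steps 5000, 10000, 15000

Note that eval_loss climbs from 0.8767 at epoch 2 to 0.8972 at epoch 3 while train loss keeps falling, which suggests the final epoch mildly overfits.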
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3303ac2034c16851b9953b2116fac54a0666ba5d4a68472db2d7090bc1db3a4
3
+ size 4856
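training_args.bin is checked in as a Git LFS pointer (version/oid/size), not the payload itself; the underlying file is the pickled TrainingArguments object the Trainer saves with torch.save. A sketch for inspecting it once the LFS object has been pulled; weights_only=False is needed on newer torch releases because this is a pickled Python object rather than a tensor file, and transformers must be installed for the unpickling to resolve its classes:

import torch

# Unpickle the saved TrainingArguments.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)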
training_logs.txt ADDED
@@ -0,0 +1,323 @@
1
+ [train dset len] 160000
2
+ [valid dset len] 1000
3
+ /usr0/home/liangzel/anaconda3/envs/air2/lib/python3.11/site-packages/accelerate/accelerator.py:432:
4
+ FutureWarning: Passing the following arguments to `Accelerator` is deprecated
5
+ and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches',
6
+ 'split_batches', 'even_batches', 'use_seedable_sampler']). Please pass an
7
+ `accelerate.DataLoaderConfiguration` instead:
8
+ dataloader_config = DataLoaderConfiguration(dispatch_batches=None,
9
+ split_batches=False, even_batches=True, use_seedable_sampler=True)
10
+ warnings.warn(
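The FutureWarning above spells out its own fix; a sketch of the replacement call with the exact flag values from the warning, assuming an accelerate version that ships DataLoaderConfiguration (the import path may differ slightly across releases):

from accelerate import Accelerator
from accelerate.utils import DataLoaderConfiguration

# Bundle the dataloader flags instead of passing them to Accelerator directly.
dataloader_config = DataLoaderConfiguration(
    dispatch_batches=None,
    split_batches=False,
    even_batches=True,
    use_seedable_sampler=True,
)
accelerator = Accelerator(dataloader_config=dataloader_config)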
11
+ {'loss': 0.9803, 'grad_norm': 1.0555673837661743, 'learning_rate':
12
+ 1.9866666666666667e-05, 'epoch': 0.02}
13
+ {'loss': 0.9477, 'grad_norm': 1.0609339475631714, 'learning_rate':
14
+ 1.9733333333333336e-05, 'epoch': 0.04}
15
+ {'loss': 0.9472, 'grad_norm': 0.8154662251472473, 'learning_rate':
16
+ 1.9600000000000002e-05, 'epoch': 0.06}
17
+ {'loss': 0.9496, 'grad_norm': 0.8507415652275085, 'learning_rate':
18
+ 1.9466666666666668e-05, 'epoch': 0.08}
19
+ {'loss': 0.9416, 'grad_norm': 0.9045298099517822, 'learning_rate':
20
+ 1.9333333333333333e-05, 'epoch': 0.1}
21
+ {'loss': 0.9501, 'grad_norm': 1.2995306253433228, 'learning_rate':
22
+ 1.9200000000000003e-05, 'epoch': 0.12}
23
+ {'loss': 0.929, 'grad_norm': 0.8860793709754944, 'learning_rate':
24
+ 1.9066666666666668e-05, 'epoch': 0.14}
25
+ {'loss': 0.9323, 'grad_norm': 0.8791409134864807, 'learning_rate':
26
+ 1.8933333333333334e-05, 'epoch': 0.16}
27
+ {'loss': 0.9252, 'grad_norm': 0.868554949760437, 'learning_rate': 1.88e-05,
28
+ 'epoch': 0.18}
29
+ {'loss': 0.916, 'grad_norm': 0.7996535301208496, 'learning_rate':
30
+ 1.866666666666667e-05, 'epoch': 0.2}
31
+ {'loss': 0.9267, 'grad_norm': 0.9991319179534912, 'learning_rate':
32
+ 1.8533333333333334e-05, 'epoch': 0.22}
33
+ {'loss': 0.9281, 'grad_norm': 1.0026607513427734, 'learning_rate':
34
+ 1.8400000000000003e-05, 'epoch': 0.24}
35
+ {'loss': 0.9272, 'grad_norm': 0.860996663570404, 'learning_rate':
36
+ 1.826666666666667e-05, 'epoch': 0.26}
37
+ {'loss': 0.9277, 'grad_norm': 0.8613474369049072, 'learning_rate':
38
+ 1.8133333333333335e-05, 'epoch': 0.28}
39
+ {'loss': 0.9293, 'grad_norm': 0.8491615653038025, 'learning_rate': 1.8e-05,
40
+ 'epoch': 0.3}
41
+ {'loss': 0.917, 'grad_norm': 0.9264828562736511, 'learning_rate':
42
+ 1.7866666666666666e-05, 'epoch': 0.32}
43
+ {'loss': 0.9325, 'grad_norm': 0.8633733987808228, 'learning_rate':
44
+ 1.7733333333333335e-05, 'epoch': 0.34}
45
+ {'loss': 0.9126, 'grad_norm': 0.8906639218330383, 'learning_rate': 1.76e-05,
46
+ 'epoch': 0.36}
47
+ {'loss': 0.9065, 'grad_norm': 0.885764479637146, 'learning_rate':
48
+ 1.7466666666666667e-05, 'epoch': 0.38}
49
+ {'loss': 0.9429, 'grad_norm': 0.8839893937110901, 'learning_rate':
50
+ 1.7333333333333336e-05, 'epoch': 0.4}
51
+ {'loss': 0.9056, 'grad_norm': 0.913644552230835, 'learning_rate': 1.72e-05,
52
+ 'epoch': 0.42}
53
+ {'loss': 0.913, 'grad_norm': 0.8399360179901123, 'learning_rate':
54
+ 1.706666666666667e-05, 'epoch': 0.44}
55
+ {'loss': 0.9192, 'grad_norm': 0.8176397681236267, 'learning_rate':
56
+ 1.6933333333333336e-05, 'epoch': 0.46}
57
+ {'loss': 0.9222, 'grad_norm': 0.8771522045135498, 'learning_rate':
58
+ 1.6800000000000002e-05, 'epoch': 0.48}
59
+ {'loss': 0.9197, 'grad_norm': 0.7874905467033386, 'learning_rate':
60
+ 1.6666666666666667e-05, 'epoch': 0.5}
61
+ {'loss': 0.91, 'grad_norm': 0.8051652312278748, 'learning_rate':
62
+ 1.6533333333333333e-05, 'epoch': 0.52}
63
+ {'loss': 0.9091, 'grad_norm': 0.80189049243927, 'learning_rate': 1.64e-05,
64
+ 'epoch': 0.54}
65
+ {'loss': 0.9111, 'grad_norm': 0.7678588032722473, 'learning_rate':
66
+ 1.6266666666666668e-05, 'epoch': 0.56}
67
+ {'loss': 0.9127, 'grad_norm': 0.7835684418678284, 'learning_rate':
68
+ 1.6133333333333334e-05, 'epoch': 0.58}
69
+ {'loss': 0.9038, 'grad_norm': 0.8735753297805786, 'learning_rate':
70
+ 1.6000000000000003e-05, 'epoch': 0.6}
71
+ {'loss': 0.9148, 'grad_norm': 0.8635367155075073, 'learning_rate':
72
+ 1.586666666666667e-05, 'epoch': 0.62}
73
+ {'loss': 0.9123, 'grad_norm': 0.8477209210395813, 'learning_rate':
74
+ 1.5733333333333334e-05, 'epoch': 0.64}
75
+ {'loss': 0.8898, 'grad_norm': 0.7765554785728455, 'learning_rate':
76
+ 1.5600000000000003e-05, 'epoch': 0.66}
77
+ {'loss': 0.8992, 'grad_norm': 0.891146183013916, 'learning_rate':
78
+ 1.546666666666667e-05, 'epoch': 0.68}
79
+ {'loss': 0.9114, 'grad_norm': 0.8695024251937866, 'learning_rate':
80
+ 1.5333333333333334e-05, 'epoch': 0.7}
81
+ {'loss': 0.89, 'grad_norm': 0.927379846572876, 'learning_rate':
82
+ 1.5200000000000002e-05, 'epoch': 0.72}
83
+ {'loss': 0.9159, 'grad_norm': 0.885179340839386, 'learning_rate':
84
+ 1.5066666666666668e-05, 'epoch': 0.74}
85
+ {'loss': 0.9115, 'grad_norm': 0.9603422284126282, 'learning_rate':
86
+ 1.4933333333333335e-05, 'epoch': 0.76}
87
+ {'loss': 0.9025, 'grad_norm': 0.8624181151390076, 'learning_rate': 1.48e-05,
88
+ 'epoch': 0.78}
89
+ {'loss': 0.8984, 'grad_norm': 0.9430868625640869, 'learning_rate':
90
+ 1.4666666666666666e-05, 'epoch': 0.8}
91
+ {'loss': 0.8887, 'grad_norm': 0.7860404849052429, 'learning_rate':
92
+ 1.4533333333333335e-05, 'epoch': 0.82}
93
+ {'loss': 0.9097, 'grad_norm': 0.793118417263031, 'learning_rate':
94
+ 1.4400000000000001e-05, 'epoch': 0.84}
95
+ {'loss': 0.8919, 'grad_norm': 0.8741154074668884, 'learning_rate':
96
+ 1.4266666666666668e-05, 'epoch': 0.86}
97
+ {'loss': 0.8927, 'grad_norm': 0.7793194651603699, 'learning_rate':
98
+ 1.4133333333333334e-05, 'epoch': 0.88}
99
+ {'loss': 0.9032, 'grad_norm': 0.7923697233200073, 'learning_rate': 1.4e-05,
100
+ 'epoch': 0.9}
101
+ {'loss': 0.8878, 'grad_norm': 0.8406550288200378, 'learning_rate':
102
+ 1.3866666666666669e-05, 'epoch': 0.92}
103
+ {'loss': 0.9032, 'grad_norm': 1.0170931816101074, 'learning_rate':
104
+ 1.3733333333333335e-05, 'epoch': 0.94}
105
+ {'loss': 0.9026, 'grad_norm': 0.8855489492416382, 'learning_rate':
106
+ 1.3600000000000002e-05, 'epoch': 0.96}
107
+ {'loss': 0.8978, 'grad_norm': 0.7720271348953247, 'learning_rate':
108
+ 1.3466666666666668e-05, 'epoch': 0.98}
109
+ {'loss': 0.8958, 'grad_norm': 0.841073751449585, 'learning_rate':
110
+ 1.3333333333333333e-05, 'epoch': 1.0}
111
+ {'eval_loss': 0.88493812084198, 'eval_runtime': 62.115,
112
+ 'eval_samples_per_second': 16.099, 'eval_steps_per_second': 2.012, 'epoch':
113
+ 1.0}
114
+ {'loss': 0.7886, 'grad_norm': 0.856351375579834, 'learning_rate':
115
+ 1.3200000000000002e-05, 'epoch': 1.02}
116
+ {'loss': 0.7957, 'grad_norm': 0.92889404296875, 'learning_rate':
117
+ 1.3066666666666668e-05, 'epoch': 1.04}
118
+ {'loss': 0.7887, 'grad_norm': 0.8512794375419617, 'learning_rate':
119
+ 1.2933333333333334e-05, 'epoch': 1.06}
120
+ {'loss': 0.8119, 'grad_norm': 0.8943263292312622, 'learning_rate':
121
+ 1.2800000000000001e-05, 'epoch': 1.08}
122
+ {'loss': 0.7958, 'grad_norm': 0.908469557762146, 'learning_rate':
123
+ 1.2666666666666667e-05, 'epoch': 1.1}
124
+ {'loss': 0.7958, 'grad_norm': 0.790229856967926, 'learning_rate':
125
+ 1.2533333333333336e-05, 'epoch': 1.12}
126
+ {'loss': 0.7907, 'grad_norm': 1.0110950469970703, 'learning_rate':
127
+ 1.2400000000000002e-05, 'epoch': 1.14}
128
+ {'loss': 0.7939, 'grad_norm': 0.9110943078994751, 'learning_rate':
129
+ 1.2266666666666667e-05, 'epoch': 1.16}
130
+ {'loss': 0.7849, 'grad_norm': 0.8515949249267578, 'learning_rate':
131
+ 1.2133333333333335e-05, 'epoch': 1.18}
132
+ {'loss': 0.8036, 'grad_norm': 0.8418060541152954, 'learning_rate': 1.2e-05,
133
+ 'epoch': 1.2}
134
+ {'loss': 0.8002, 'grad_norm': 0.8303418159484863, 'learning_rate':
135
+ 1.186666666666667e-05, 'epoch': 1.22}
136
+ {'loss': 0.8005, 'grad_norm': 0.8996633291244507, 'learning_rate':
137
+ 1.1733333333333335e-05, 'epoch': 1.24}
138
+ {'loss': 0.7956, 'grad_norm': 0.8889008164405823, 'learning_rate': 1.16e-05,
139
+ 'epoch': 1.26}
140
+ {'loss': 0.786, 'grad_norm': 0.8718474507331848, 'learning_rate':
141
+ 1.1466666666666668e-05, 'epoch': 1.28}
142
+ {'loss': 0.8017, 'grad_norm': 0.931644856929779, 'learning_rate':
143
+ 1.1333333333333334e-05, 'epoch': 1.3}
144
+ {'loss': 0.7855, 'grad_norm': 0.873427152633667, 'learning_rate':
145
+ 1.1200000000000001e-05, 'epoch': 1.32}
146
+ {'loss': 0.802, 'grad_norm': 0.8685030341148376, 'learning_rate':
147
+ 1.1066666666666669e-05, 'epoch': 1.34}
148
+ {'loss': 0.7936, 'grad_norm': 0.8744147419929504, 'learning_rate':
149
+ 1.0933333333333334e-05, 'epoch': 1.36}
150
+ {'loss': 0.7864, 'grad_norm': 0.9263073205947876, 'learning_rate':
151
+ 1.0800000000000002e-05, 'epoch': 1.38}
152
+ {'loss': 0.7886, 'grad_norm': 0.8322786688804626, 'learning_rate':
153
+ 1.0666666666666667e-05, 'epoch': 1.4}
154
+ {'loss': 0.7889, 'grad_norm': 0.8734496235847473, 'learning_rate':
155
+ 1.0533333333333333e-05, 'epoch': 1.42}
156
+ {'loss': 0.8092, 'grad_norm': 1.0558969974517822, 'learning_rate': 1.04e-05,
157
+ 'epoch': 1.44}
158
+ {'loss': 0.8061, 'grad_norm': 0.8517795205116272, 'learning_rate':
159
+ 1.0266666666666668e-05, 'epoch': 1.46}
160
+ {'loss': 0.8073, 'grad_norm': 0.8912076354026794, 'learning_rate':
161
+ 1.0133333333333335e-05, 'epoch': 1.48}
162
+ {'loss': 0.789, 'grad_norm': 0.8496484756469727, 'learning_rate': 1e-05,
163
+ 'epoch': 1.5}
164
+ {'loss': 0.7813, 'grad_norm': 0.8365699648857117, 'learning_rate':
165
+ 9.866666666666668e-06, 'epoch': 1.52}
166
+ {'loss': 0.7923, 'grad_norm': 0.8612975478172302, 'learning_rate':
167
+ 9.733333333333334e-06, 'epoch': 1.54}
168
+ {'loss': 0.8017, 'grad_norm': 0.825588047504425, 'learning_rate':
169
+ 9.600000000000001e-06, 'epoch': 1.56}
170
+ {'loss': 0.7975, 'grad_norm': 0.9208545684814453, 'learning_rate':
171
+ 9.466666666666667e-06, 'epoch': 1.58}
172
+ {'loss': 0.8023, 'grad_norm': 0.8390321731567383, 'learning_rate':
173
+ 9.333333333333334e-06, 'epoch': 1.6}
174
+ {'loss': 0.7965, 'grad_norm': 0.9178739786148071, 'learning_rate':
175
+ 9.200000000000002e-06, 'epoch': 1.62}
176
+ {'loss': 0.7977, 'grad_norm': 0.9349595904350281, 'learning_rate':
177
+ 9.066666666666667e-06, 'epoch': 1.64}
178
+ {'loss': 0.7898, 'grad_norm': 0.8347993493080139, 'learning_rate':
179
+ 8.933333333333333e-06, 'epoch': 1.66}
180
+ {'loss': 0.7903, 'grad_norm': 0.8217402696609497, 'learning_rate': 8.8e-06,
181
+ 'epoch': 1.68}
182
+ {'loss': 0.8032, 'grad_norm': 0.8198782801628113, 'learning_rate':
183
+ 8.666666666666668e-06, 'epoch': 1.7}
184
+ {'loss': 0.7967, 'grad_norm': 0.8704864382743835, 'learning_rate':
185
+ 8.533333333333335e-06, 'epoch': 1.72}
186
+ {'loss': 0.7884, 'grad_norm': 0.892342746257782, 'learning_rate':
187
+ 8.400000000000001e-06, 'epoch': 1.74}
188
+ {'loss': 0.7859, 'grad_norm': 0.9803333282470703, 'learning_rate':
189
+ 8.266666666666667e-06, 'epoch': 1.76}
190
+ {'loss': 0.7872, 'grad_norm': 0.8991880416870117, 'learning_rate':
191
+ 8.133333333333334e-06, 'epoch': 1.78}
192
+ {'loss': 0.7967, 'grad_norm': 1.0206116437911987, 'learning_rate':
193
+ 8.000000000000001e-06, 'epoch': 1.8}
194
+ {'loss': 0.7893, 'grad_norm': 0.8060243725776672, 'learning_rate':
195
+ 7.866666666666667e-06, 'epoch': 1.82}
196
+ {'loss': 0.7985, 'grad_norm': 0.8802576065063477, 'learning_rate':
197
+ 7.733333333333334e-06, 'epoch': 1.84}
198
+ {'loss': 0.7889, 'grad_norm': 0.8549374341964722, 'learning_rate':
199
+ 7.600000000000001e-06, 'epoch': 1.86}
200
+ {'loss': 0.8061, 'grad_norm': 0.9457976222038269, 'learning_rate':
201
+ 7.4666666666666675e-06, 'epoch': 1.88}
202
+ {'loss': 0.7866, 'grad_norm': 0.8606815338134766, 'learning_rate':
203
+ 7.333333333333333e-06, 'epoch': 1.9}
204
+ {'loss': 0.7951, 'grad_norm': 0.8577678799629211, 'learning_rate':
205
+ 7.2000000000000005e-06, 'epoch': 1.92}
206
+ {'loss': 0.7879, 'grad_norm': 0.9072070717811584, 'learning_rate':
207
+ 7.066666666666667e-06, 'epoch': 1.94}
208
+ {'loss': 0.7936, 'grad_norm': 0.8739129304885864, 'learning_rate':
209
+ 6.9333333333333344e-06, 'epoch': 1.96}
210
+ {'loss': 0.7825, 'grad_norm': 0.8881368637084961, 'learning_rate':
211
+ 6.800000000000001e-06, 'epoch': 1.98}
212
+ {'loss': 0.7837, 'grad_norm': 0.9448174834251404, 'learning_rate':
213
+ 6.666666666666667e-06, 'epoch': 2.0}
214
+ {'eval_loss': 0.8766704201698303, 'eval_runtime': 64.0882,
215
+ 'eval_samples_per_second': 15.604, 'eval_steps_per_second': 1.95, 'epoch':
216
+ 2.0}
217
+ {'loss': 0.7014, 'grad_norm': 0.8892523050308228, 'learning_rate':
218
+ 6.534666666666667e-06, 'epoch': 2.02}
219
+ {'loss': 0.6962, 'grad_norm': 0.9131534695625305, 'learning_rate':
220
+ 6.4013333333333334e-06, 'epoch': 2.04}
221
+ {'loss': 0.6874, 'grad_norm': 0.9470012784004211, 'learning_rate':
222
+ 6.268000000000001e-06, 'epoch': 2.06}
223
+ {'loss': 0.7029, 'grad_norm': 0.9785165190696716, 'learning_rate':
224
+ 6.134666666666667e-06, 'epoch': 2.08}
225
+ {'loss': 0.6922, 'grad_norm': 1.0245338678359985, 'learning_rate':
226
+ 6.001333333333334e-06, 'epoch': 2.1}
227
+ {'loss': 0.6973, 'grad_norm': 0.920975387096405, 'learning_rate': 5.868e-06,
228
+ 'epoch': 2.12}
229
+ {'loss': 0.6917, 'grad_norm': 0.9391937851905823, 'learning_rate':
230
+ 5.734666666666667e-06, 'epoch': 2.14}
231
+ {'loss': 0.6965, 'grad_norm': 0.9682599306106567, 'learning_rate':
232
+ 5.601333333333334e-06, 'epoch': 2.16}
233
+ {'loss': 0.698, 'grad_norm': 1.0669585466384888, 'learning_rate': 5.468e-06,
234
+ 'epoch': 2.18}
235
+ {'loss': 0.6961, 'grad_norm': 1.0977643728256226, 'learning_rate':
236
+ 5.3346666666666665e-06, 'epoch': 2.2}
237
+ {'loss': 0.6935, 'grad_norm': 0.9608291387557983, 'learning_rate':
238
+ 5.201333333333334e-06, 'epoch': 2.22}
239
+ {'loss': 0.691, 'grad_norm': 0.9867018461227417, 'learning_rate':
240
+ 5.069333333333334e-06, 'epoch': 2.24}
241
+ {'loss': 0.6973, 'grad_norm': 0.9778249263763428, 'learning_rate': 4.936e-06,
242
+ 'epoch': 2.26}
243
+ {'loss': 0.6958, 'grad_norm': 1.040911316871643, 'learning_rate':
244
+ 4.802666666666667e-06, 'epoch': 2.28}
245
+ {'loss': 0.7075, 'grad_norm': 0.948652446269989, 'learning_rate':
246
+ 4.669333333333334e-06, 'epoch': 2.3}
247
+ {'loss': 0.6966, 'grad_norm': 1.045653223991394, 'learning_rate': 4.536e-06,
248
+ 'epoch': 2.32}
249
+ {'loss': 0.7036, 'grad_norm': 0.9575178623199463, 'learning_rate':
250
+ 4.4040000000000005e-06, 'epoch': 2.34}
251
+ {'loss': 0.7038, 'grad_norm': 0.8968670964241028, 'learning_rate':
252
+ 4.270666666666667e-06, 'epoch': 2.36}
253
+ {'loss': 0.687, 'grad_norm': 0.9445046186447144, 'learning_rate':
254
+ 4.137333333333334e-06, 'epoch': 2.38}
255
+ {'loss': 0.6978, 'grad_norm': 1.0018490552902222, 'learning_rate': 4.004e-06,
256
+ 'epoch': 2.4}
257
+ {'loss': 0.6876, 'grad_norm': 0.9981265664100647, 'learning_rate':
258
+ 3.870666666666667e-06, 'epoch': 2.42}
259
+ {'loss': 0.691, 'grad_norm': 0.9957240223884583, 'learning_rate':
260
+ 3.737333333333333e-06, 'epoch': 2.44}
261
+ {'loss': 0.7026, 'grad_norm': 1.0652461051940918, 'learning_rate': 3.604e-06,
262
+ 'epoch': 2.46}
263
+ {'loss': 0.6918, 'grad_norm': 0.9067023992538452, 'learning_rate':
264
+ 3.470666666666667e-06, 'epoch': 2.48}
265
+ {'loss': 0.7013, 'grad_norm': 0.997002899646759, 'learning_rate':
266
+ 3.3373333333333336e-06, 'epoch': 2.5}
267
+ {'loss': 0.7025, 'grad_norm': 0.914594829082489, 'learning_rate':
268
+ 3.2040000000000006e-06, 'epoch': 2.52}
269
+ {'loss': 0.7053, 'grad_norm': 1.197375774383545, 'learning_rate':
270
+ 3.0706666666666667e-06, 'epoch': 2.54}
271
+ {'loss': 0.6888, 'grad_norm': 1.0545796155929565, 'learning_rate':
272
+ 2.9373333333333336e-06, 'epoch': 2.56}
273
+ {'loss': 0.6853, 'grad_norm': 1.0479904413223267, 'learning_rate': 2.804e-06,
274
+ 'epoch': 2.58}
275
+ {'loss': 0.7053, 'grad_norm': 0.9415286183357239, 'learning_rate':
276
+ 2.670666666666667e-06, 'epoch': 2.6}
277
+ {'loss': 0.7093, 'grad_norm': 0.9765642881393433, 'learning_rate':
278
+ 2.5373333333333332e-06, 'epoch': 2.62}
279
+ {'loss': 0.7026, 'grad_norm': 0.9646571278572083, 'learning_rate': 2.404e-06,
280
+ 'epoch': 2.64}
281
+ {'loss': 0.6967, 'grad_norm': 0.9278367757797241, 'learning_rate':
282
+ 2.2706666666666667e-06, 'epoch': 2.66}
283
+ {'loss': 0.699, 'grad_norm': 0.9418883323669434, 'learning_rate':
284
+ 2.1373333333333337e-06, 'epoch': 2.68}
285
+ {'loss': 0.6937, 'grad_norm': 1.0113035440444946, 'learning_rate': 2.004e-06,
286
+ 'epoch': 2.7}
287
+ {'loss': 0.6795, 'grad_norm': 1.0150426626205444, 'learning_rate':
288
+ 1.8706666666666667e-06, 'epoch': 2.72}
289
+ {'loss': 0.6822, 'grad_norm': 1.177462100982666, 'learning_rate':
290
+ 1.7373333333333333e-06, 'epoch': 2.74}
291
+ {'loss': 0.7039, 'grad_norm': 1.1305912733078003, 'learning_rate': 1.604e-06,
292
+ 'epoch': 2.76}
293
+ {'loss': 0.6864, 'grad_norm': 0.9975621700286865, 'learning_rate':
294
+ 1.470666666666667e-06, 'epoch': 2.78}
295
+ {'loss': 0.6945, 'grad_norm': 1.0691418647766113, 'learning_rate':
296
+ 1.3373333333333335e-06, 'epoch': 2.8}
297
+ {'loss': 0.6891, 'grad_norm': 0.988650381565094, 'learning_rate': 1.204e-06,
298
+ 'epoch': 2.82}
299
+ {'loss': 0.6873, 'grad_norm': 1.0204881429672241, 'learning_rate':
300
+ 1.0706666666666668e-06, 'epoch': 2.84}
301
+ {'loss': 0.691, 'grad_norm': 1.0628246068954468, 'learning_rate':
302
+ 9.373333333333334e-07, 'epoch': 2.86}
303
+ {'loss': 0.6938, 'grad_norm': 1.058432698249817, 'learning_rate': 8.04e-07,
304
+ 'epoch': 2.88}
305
+ {'loss': 0.7113, 'grad_norm': 1.0059112310409546, 'learning_rate':
306
+ 6.706666666666667e-07, 'epoch': 2.9}
307
+ {'loss': 0.6831, 'grad_norm': 0.963862955570221, 'learning_rate':
308
+ 5.373333333333334e-07, 'epoch': 2.92}
309
+ {'loss': 0.7055, 'grad_norm': 0.9393936395645142, 'learning_rate': 4.04e-07,
310
+ 'epoch': 2.94}
311
+ {'loss': 0.6944, 'grad_norm': 1.1199434995651245, 'learning_rate':
312
+ 2.706666666666667e-07, 'epoch': 2.96}
313
+ {'loss': 0.7033, 'grad_norm': 1.1279562711715698, 'learning_rate':
314
+ 1.3733333333333335e-07, 'epoch': 2.98}
315
+ {'loss': 0.6902, 'grad_norm': 0.9235104918479919, 'learning_rate': 4e-09,
316
+ 'epoch': 3.0}
317
+ {'eval_loss': 0.8971832394599915, 'eval_runtime': 53.8519,
318
+ 'eval_samples_per_second': 18.569, 'eval_steps_per_second': 2.321, 'epoch':
319
+ 3.0}
320
+ {'train_runtime': 69618.6499, 'train_samples_per_second': 6.895,
321
+ 'train_steps_per_second': 0.215, 'train_loss': 0.8021132176717123, 'epoch':
322
+ 3.0}
323
+
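Aside from the two dataset-size lines and the Accelerate warning at the top, every entry in training_logs.txt is a Python dict literal; the records are wrapped across lines in this paste, so a parser has to rejoin them. A sketch that does so by balancing braces; the filename is an assumption:

import ast

records, buf = [], ""
with open("training_logs.txt") as f:
    for line in f:
        line = line.strip()
        if not buf and not line.startswith("{"):
            continue  # skip dataset-size lines and the warning text
        buf += line + " "
        if buf.count("{") == buf.count("}"):
            records.append(ast.literal_eval(buf.strip()))
            buf = ""

train = [r for r in records if "loss" in r]
evals = [r for r in records if "eval_loss" in r]
final = records[-1]  # the train_runtime summary record
print(len(train), len(evals), final["train_loss"])  # 150 3 0.8021132176717123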