cterdam committed on
Commit
9f14619
·
verified ·
1 Parent(s): a61b90a

Upload folder using huggingface_hub

Browse files
command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ main.py deepseek None runs/deepseek-ctrl-gh/checkpoint-15000
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "runs/deepseek-ctrl-gh/checkpoint-15000",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 32013,
9
+ "eos_token_id": 32021,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5504,
14
+ "max_position_embeddings": 16384,
15
+ "model_type": "llama",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 24,
18
+ "num_key_value_heads": 16,
19
+ "pretraining_tp": 1,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": {
22
+ "factor": 4.0,
23
+ "type": "linear"
24
+ },
25
+ "rope_theta": 100000,
26
+ "tie_word_embeddings": false,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.38.2",
29
+ "use_cache": true,
30
+ "vocab_size": 32256
31
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 32013,
4
+ "eos_token_id": 32021,
5
+ "transformers_version": "4.38.2"
6
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6d19cff5cedf87734ca26cc1433c8416fc0acccb1d6e0a72416902d5f93efb9
3
+ size 4986380064
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47f52fb9188137c93237bc2d29f5bad8c88cc585613e0b2dbad3afb23a3c2f22
3
+ size 399532808
model.safetensors.index.json ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 5385887744
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00002-of-00002.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
162
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
163
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
164
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
165
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
166
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
167
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
168
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
169
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
170
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
172
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
173
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
174
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
175
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
176
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
177
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
178
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
179
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
180
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
181
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
182
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
184
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
185
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
186
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
187
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
188
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
189
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
190
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
191
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
192
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
193
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
194
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
195
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
196
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
197
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
198
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
199
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
200
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
201
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
202
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
203
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
204
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
205
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
206
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
207
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
208
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
209
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
210
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
211
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
212
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
213
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
214
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
215
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
216
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
217
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
218
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
219
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
220
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
221
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
222
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
223
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
224
+ "model.norm.weight": "model-00002-of-00002.safetensors"
225
+ }
226
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8b617ab3f5d2199c0f65179969c8a22980f3543a4d0a52b05caea65094020bb
3
+ size 2699039674
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c52295d3807e2e59216b7e1ea6b0ab41cccc585ef6a638edbc526508897829a6
3
+ size 14180
run_config.json ADDED
@@ -0,0 +1,602 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "codellama": {
4
+ "base_model_id": "codellama/CodeLlama-7b-hf",
5
+ "quantitize": "int8",
6
+ "dataset": "Arithmetic_Simple",
7
+ "data_collator": "DataCollatorForSeq2Seq",
8
+ "lora_config": {
9
+ "r": 16,
10
+ "lora_alpha": 16,
11
+ "target_modules": [
12
+ "q_proj",
13
+ "k_proj",
14
+ "v_proj",
15
+ "o_proj",
16
+ "gate_proj",
17
+ "up_proj",
18
+ "down_proj"
19
+ ],
20
+ "lora_dropout": 0.05,
21
+ "bias": "none",
22
+ "task_type": "CAUSAL_LM"
23
+ },
24
+ "training_args": {
25
+ "output_dir": "codellama-output",
26
+ "warmup_steps": 100,
27
+ "per_device_train_batch_size": 1,
28
+ "per_device_eval_batch_size": 1,
29
+ "gradient_accumulation_steps": 4,
30
+ "max_steps": 10000,
31
+ "learning_rate": 0.0003,
32
+ "optim": "adamw_torch",
33
+ "logging_dir": "codellama-output-logs",
34
+ "logging_steps": 10,
35
+ "save_strategy": "steps",
36
+ "save_steps": 500,
37
+ "load_best_model_at_end": false,
38
+ "group_by_length": true,
39
+ "fp16": true,
40
+ "evaluation_strategy": "steps",
41
+ "eval_steps": 1000
42
+ },
43
+ "tokenizer": {
44
+ "tokenize_config": {
45
+ "truncation": true,
46
+ "max_length": 192,
47
+ "padding": "max_length"
48
+ },
49
+ "prompt_template": "config/qa_template.txt"
50
+ }
51
+ },
52
+ "phi-2": {
53
+ "base_model_id": "microsoft/phi-2",
54
+ "quantitize": "fp16",
55
+ "dataset": "Arithmetic_Simple",
56
+ "data_collator": "DataCollatorForLanguageModeling",
57
+ "lora_config": {
58
+ "r": 32,
59
+ "lora_alpha": 64,
60
+ "target_modules": [
61
+ "q_proj",
62
+ "k_proj",
63
+ "v_proj",
64
+ "dense",
65
+ "fc1",
66
+ "fc2"
67
+ ],
68
+ "bias": "none",
69
+ "lora_dropout": 0.05,
70
+ "task_type": "CAUSAL_LM"
71
+ },
72
+ "training_args": {
73
+ "output_dir": "phi2-output",
74
+ "warmup_steps": 500,
75
+ "per_device_train_batch_size": 1,
76
+ "per_device_eval_batch_size": 1,
77
+ "gradient_accumulation_steps": 4,
78
+ "max_steps": 100000,
79
+ "learning_rate": 0.0003,
80
+ "optim": "paged_adamw_8bit",
81
+ "logging_dir": "phi2-output-logs",
82
+ "logging_steps": 100,
83
+ "save_strategy": "steps",
84
+ "save_steps": 500,
85
+ "evaluation_strategy": "steps",
86
+ "eval_steps": 500,
87
+ "fp16": true
88
+ },
89
+ "tokenizer": {
90
+ "tokenize_config": {
91
+ "truncation": true,
92
+ "max_length": 512,
93
+ "padding": "max_length"
94
+ },
95
+ "prompt_template": "config/qa_template.txt"
96
+ }
97
+ },
98
+ "deepseek": {
99
+ "base_model_id": "deepseek-ai/deepseek-coder-1.3b-instruct",
100
+ "quantitize": "fp16",
101
+ "dataset": "mixture_codegen",
102
+ "data_collator": "DataCollatorForLanguageModeling",
103
+ "lora_config": {
104
+ "r": 32,
105
+ "lora_alpha": 64,
106
+ "target_modules": [
107
+ "q_proj",
108
+ "k_proj",
109
+ "v_proj",
110
+ "o_proj",
111
+ "gate_proj",
112
+ "up_proj",
113
+ "down_proj"
114
+ ],
115
+ "bias": "none",
116
+ "lora_dropout": 0.05,
117
+ "task_type": "CAUSAL_LM"
118
+ },
119
+ "lora_large_config": {
120
+ "r": 128,
121
+ "lora_alpha": 256,
122
+ "target_modules": [
123
+ "q_proj",
124
+ "k_proj",
125
+ "v_proj",
126
+ "o_proj",
127
+ "gate_proj",
128
+ "up_proj",
129
+ "down_proj"
130
+ ],
131
+ "bias": "none",
132
+ "lora_dropout": 0.05,
133
+ "task_type": "CAUSAL_LM"
134
+ },
135
+ "p_tuning_config": {
136
+ "num_virtual_tokens": 16,
137
+ "num_transformer_submodules": 1,
138
+ "token_dim": 2048,
139
+ "encoder_hidden_size": 2048,
140
+ "task_type": "CAUSAL_LM"
141
+ },
142
+ "training_args": {
143
+ "output_dir": "runs/deepseek-ctrl-gh-mixture",
144
+ "warmup_steps": 0,
145
+ "per_device_train_batch_size": 4,
146
+ "per_device_eval_batch_size": 4,
147
+ "gradient_accumulation_steps": 8,
148
+ "max_steps": 5000,
149
+ "learning_rate": 2e-05,
150
+ "optim": "paged_adamw_8bit",
151
+ "logging_dir": "runs/deepseek-ctrl-gh-mixture/logs",
152
+ "logging_steps": 100,
153
+ "save_strategy": "steps",
154
+ "save_steps": 2500,
155
+ "evaluation_strategy": "steps",
156
+ "eval_steps": 2500,
157
+ "weight_decay": 0.01,
158
+ "fp16": true
159
+ },
160
+ "tokenizer": {
161
+ "tokenize_config": {
162
+ "truncation": true,
163
+ "max_length": 1024,
164
+ "padding": "max_length"
165
+ },
166
+ "prompt_template": "config/qa_template.txt"
167
+ }
168
+ }
169
+ },
170
+ "dataset": {
171
+ "simple_dataset": {
172
+ "type": "huggingface",
173
+ "dataset_purpose": "downstream",
174
+ "name": "b-mc2/sql-create-context",
175
+ "train_split": 0.9,
176
+ "max_train_size": 100,
177
+ "filling_field": [
178
+ "question",
179
+ "context",
180
+ "answer"
181
+ ]
182
+ },
183
+ "testdset": {
184
+ "type": "local",
185
+ "dataset_purpose": "downstream",
186
+ "train_file": "data/Test/TestDataset.json",
187
+ "val_file": "data/Test/TestDataset.json",
188
+ "test_file": "data/Test/TestDataset.json",
189
+ "filling_field": [
190
+ "prompted_question",
191
+ "answer"
192
+ ]
193
+ },
194
+ "mixture_codegen": {
195
+ "filling_field": [
196
+ "Question",
197
+ "Answer"
198
+ ],
199
+ "dataset_purpose": "downstream"
200
+ },
201
+ "MathQA_Python_loader": {
202
+ "type": "list-like",
203
+ "dataset_purpose": "downstream",
204
+ "train": "data/MathQA_Python_processed/mathqa_python_train_clean_final.json",
205
+ "val": "data/MathQA_Python_processed/mathqa_python_dev_clean_final.json",
206
+ "test": "data/MathQA_Python_processed/mathqa_python_test_clean_final.json",
207
+ "filling_field": [
208
+ "Question",
209
+ "Answer"
210
+ ]
211
+ },
212
+ "APPS_loader": {
213
+ "type": "list-like",
214
+ "dataset_purpose": "downstream",
215
+ "train": "data/APPS/apps_train.json",
216
+ "val": "data/APPS/apps_dev.json",
217
+ "test": "data/APPS/test/apps_test_75.json",
218
+ "filling_field": [
219
+ "Question",
220
+ "Answer"
221
+ ]
222
+ },
223
+ "MBPP_loader": {
224
+ "type": "list-like",
225
+ "dataset_purpose": "downstream",
226
+ "train": "data/MBPP/mbpp_train.json",
227
+ "val": "data/MBPP/mbpp_dev.json",
228
+ "test": "data/MBPP/mbpp_test.json",
229
+ "filling_field": [
230
+ "Question",
231
+ "Answer"
232
+ ]
233
+ },
234
+ "Arithmetic_Simple": {
235
+ "type": "list-like",
236
+ "dataset_purpose": "downstream",
237
+ "attributes": {
238
+ "subjects": [
239
+ 1,
240
+ 2,
241
+ 3,
242
+ 4,
243
+ 5,
244
+ 6,
245
+ 7,
246
+ 8,
247
+ 9
248
+ ],
249
+ "lessons": [
250
+ "Max_Ops1_Bounds0_100",
251
+ "Max_Ops1_Bounds0_1000",
252
+ "Max_Ops2_Bounds0_100",
253
+ "Max_Ops2_Bounds0_1000",
254
+ "Max_Ops3_Bounds0_100",
255
+ "Max_Ops3_Bounds0_1000",
256
+ "Max_Ops4_Bounds0_100",
257
+ "Max_Ops4_Bounds0_1000",
258
+ "Max_Ops5_Bounds0_100",
259
+ "Max_Ops5_Bounds0_1000"
260
+ ]
261
+ },
262
+ "train": "data/Arithmetic/Curriculum_Simple",
263
+ "val": "data/Arithmetic/Curriculum_Simple",
264
+ "test": "data/Arithmetic/Curriculum_Simple",
265
+ "filling_field": [
266
+ "Question",
267
+ "Answer"
268
+ ]
269
+ },
270
+ "Arithmetic_Hard": {
271
+ "type": "list-like",
272
+ "dataset_purpose": "downstream",
273
+ "attributes": {
274
+ "subjects": [
275
+ 1,
276
+ 2,
277
+ 3,
278
+ 4,
279
+ 5,
280
+ 6,
281
+ 7,
282
+ 8,
283
+ 9
284
+ ],
285
+ "lessons": [
286
+ "Max_Ops1_Bounds-1000_1000",
287
+ "Max_Ops1_Bounds-100_100",
288
+ "Max_Ops1_Bounds0_100",
289
+ "Max_Ops1_Bounds0_1000",
290
+ "Max_Ops2_Bounds-1000_1000",
291
+ "Max_Ops2_Bounds-100_100",
292
+ "Max_Ops2_Bounds0_100",
293
+ "Max_Ops2_Bounds0_1000",
294
+ "Max_Ops3_Bounds-1000_1000",
295
+ "Max_Ops3_Bounds-100_100",
296
+ "Max_Ops3_Bounds0_100",
297
+ "Max_Ops3_Bounds0_1000",
298
+ "Max_Ops4_Bounds-1000_1000",
299
+ "Max_Ops4_Bounds-100_100",
300
+ "Max_Ops4_Bounds0_100",
301
+ "Max_Ops4_Bounds0_1000",
302
+ "Max_Ops5_Bounds-1000_1000",
303
+ "Max_Ops5_Bounds-100_100",
304
+ "Max_Ops5_Bounds0_100",
305
+ "Max_Ops5_Bounds0_1000",
306
+ "Max_Ops6_Bounds-1000_1000",
307
+ "Max_Ops6_Bounds-100_100",
308
+ "Max_Ops6_Bounds0_100",
309
+ "Max_Ops6_Bounds0_1000",
310
+ "Max_Ops7_Bounds-1000_1000",
311
+ "Max_Ops7_Bounds-100_100",
312
+ "Max_Ops7_Bounds0_100",
313
+ "Max_Ops7_Bounds0_1000",
314
+ "Max_Ops8_Bounds-1000_1000",
315
+ "Max_Ops8_Bounds-100_100",
316
+ "Max_Ops8_Bounds0_100",
317
+ "Max_Ops8_Bounds0_1000",
318
+ "Max_Ops9_Bounds-1000_1000",
319
+ "Max_Ops9_Bounds-100_100",
320
+ "Max_Ops9_Bounds0_100",
321
+ "Max_Ops9_Bounds0_1000",
322
+ "Max_Ops10_Bounds-1000_1000",
323
+ "Max_Ops10_Bounds-100_100",
324
+ "Max_Ops10_Bounds0_100",
325
+ "Max_Ops10_Bounds0_1000"
326
+ ]
327
+ },
328
+ "train": "data/Arithmetic/Curriculum_Hard",
329
+ "val": "data/Arithmetic/Curriculum_Hard",
330
+ "test": "data/Arithmetic/Curriculum_Hard",
331
+ "filling_field": [
332
+ "Question",
333
+ "Answer"
334
+ ]
335
+ },
336
+ "Arithmetic_Hard_prompt_C11": {
337
+ "type": "list-like",
338
+ "dataset_purpose": "downstream",
339
+ "attributes": {
340
+ "subjects": [
341
+ 1,
342
+ 2,
343
+ 3,
344
+ 4,
345
+ 5,
346
+ 6,
347
+ 7,
348
+ 8,
349
+ 9
350
+ ],
351
+ "lessons": [
352
+ "Max_Ops1_Bounds-1000_1000",
353
+ "Max_Ops1_Bounds-100_100",
354
+ "Max_Ops1_Bounds0_100",
355
+ "Max_Ops1_Bounds0_1000",
356
+ "Max_Ops2_Bounds-1000_1000",
357
+ "Max_Ops2_Bounds-100_100",
358
+ "Max_Ops2_Bounds0_100",
359
+ "Max_Ops2_Bounds0_1000",
360
+ "Max_Ops3_Bounds-1000_1000",
361
+ "Max_Ops3_Bounds-100_100",
362
+ "Max_Ops3_Bounds0_100",
363
+ "Max_Ops3_Bounds0_1000",
364
+ "Max_Ops4_Bounds-1000_1000",
365
+ "Max_Ops4_Bounds-100_100",
366
+ "Max_Ops4_Bounds0_100",
367
+ "Max_Ops4_Bounds0_1000",
368
+ "Max_Ops5_Bounds-1000_1000",
369
+ "Max_Ops5_Bounds-100_100",
370
+ "Max_Ops5_Bounds0_100",
371
+ "Max_Ops5_Bounds0_1000",
372
+ "Max_Ops6_Bounds-1000_1000",
373
+ "Max_Ops6_Bounds-100_100",
374
+ "Max_Ops6_Bounds0_100",
375
+ "Max_Ops6_Bounds0_1000",
376
+ "Max_Ops7_Bounds-1000_1000",
377
+ "Max_Ops7_Bounds-100_100",
378
+ "Max_Ops7_Bounds0_100",
379
+ "Max_Ops7_Bounds0_1000",
380
+ "Max_Ops8_Bounds-1000_1000",
381
+ "Max_Ops8_Bounds-100_100",
382
+ "Max_Ops8_Bounds0_100",
383
+ "Max_Ops8_Bounds0_1000",
384
+ "Max_Ops9_Bounds-1000_1000",
385
+ "Max_Ops9_Bounds-100_100",
386
+ "Max_Ops9_Bounds0_100",
387
+ "Max_Ops9_Bounds0_1000",
388
+ "Max_Ops10_Bounds-1000_1000",
389
+ "Max_Ops10_Bounds-100_100",
390
+ "Max_Ops10_Bounds0_100",
391
+ "Max_Ops10_Bounds0_1000"
392
+ ]
393
+ },
394
+ "train": "data/Arithmetic/Curriculum_Hard",
395
+ "val": "data/Arithmetic/Curriculum_Hard",
396
+ "test": "data/Arithmetic/Curriculum_Hard",
397
+ "filling_field": [
398
+ "Question",
399
+ "Answer"
400
+ ]
401
+ },
402
+ "Arithmetic_Hard_prompt_C12": {
403
+ "type": "list-like",
404
+ "dataset_purpose": "downstream",
405
+ "attributes": {
406
+ "subjects": [
407
+ 7,
408
+ 9
409
+ ],
410
+ "lessons": [
411
+ "Max_Ops1_Bounds-1000_1000",
412
+ "Max_Ops1_Bounds-100_100",
413
+ "Max_Ops1_Bounds0_100",
414
+ "Max_Ops1_Bounds0_1000",
415
+ "Max_Ops2_Bounds-1000_1000",
416
+ "Max_Ops2_Bounds-100_100",
417
+ "Max_Ops2_Bounds0_100",
418
+ "Max_Ops2_Bounds0_1000",
419
+ "Max_Ops3_Bounds-1000_1000",
420
+ "Max_Ops3_Bounds-100_100",
421
+ "Max_Ops3_Bounds0_100",
422
+ "Max_Ops3_Bounds0_1000",
423
+ "Max_Ops4_Bounds-1000_1000",
424
+ "Max_Ops4_Bounds-100_100",
425
+ "Max_Ops4_Bounds0_100",
426
+ "Max_Ops4_Bounds0_1000",
427
+ "Max_Ops5_Bounds-1000_1000",
428
+ "Max_Ops5_Bounds-100_100",
429
+ "Max_Ops5_Bounds0_100",
430
+ "Max_Ops5_Bounds0_1000",
431
+ "Max_Ops6_Bounds-1000_1000",
432
+ "Max_Ops6_Bounds-100_100",
433
+ "Max_Ops6_Bounds0_100",
434
+ "Max_Ops6_Bounds0_1000",
435
+ "Max_Ops7_Bounds-1000_1000",
436
+ "Max_Ops7_Bounds-100_100",
437
+ "Max_Ops7_Bounds0_100",
438
+ "Max_Ops7_Bounds0_1000",
439
+ "Max_Ops8_Bounds-1000_1000",
440
+ "Max_Ops8_Bounds-100_100",
441
+ "Max_Ops8_Bounds0_100",
442
+ "Max_Ops8_Bounds0_1000",
443
+ "Max_Ops9_Bounds-1000_1000",
444
+ "Max_Ops9_Bounds-100_100",
445
+ "Max_Ops9_Bounds0_100",
446
+ "Max_Ops9_Bounds0_1000",
447
+ "Max_Ops10_Bounds-1000_1000",
448
+ "Max_Ops10_Bounds-100_100",
449
+ "Max_Ops10_Bounds0_100",
450
+ "Max_Ops10_Bounds0_1000"
451
+ ]
452
+ },
453
+ "train": "data/Arithmetic/Curriculum_Hard",
454
+ "val": "data/Arithmetic/Curriculum_Hard",
455
+ "test": "data/Arithmetic/Curriculum_Hard",
456
+ "filling_field": [
457
+ "Question",
458
+ "Answer"
459
+ ]
460
+ },
461
+ "Arithmetic_XHard": {
462
+ "type": "list-like",
463
+ "dataset_purpose": "downstream",
464
+ "attributes": {
465
+ "subjects": [
466
+ 1,
467
+ 2,
468
+ 3,
469
+ 4,
470
+ 5,
471
+ 6,
472
+ 7,
473
+ 8,
474
+ 9
475
+ ],
476
+ "lessons": [
477
+ "Max_Ops10_Bounds0_10000.json",
478
+ "Max_Ops10_Bounds0_1000.json",
479
+ "Max_Ops10_Bounds-10000_10000.json",
480
+ "Max_Ops10_Bounds-1000_1000.json",
481
+ "Max_Ops11_Bounds0_10000.json",
482
+ "Max_Ops11_Bounds0_1000.json",
483
+ "Max_Ops11_Bounds-10000_10000.json",
484
+ "Max_Ops11_Bounds-1000_1000.json",
485
+ "Max_Ops12_Bounds0_10000.json",
486
+ "Max_Ops12_Bounds0_1000.json",
487
+ "Max_Ops12_Bounds-10000_10000.json",
488
+ "Max_Ops12_Bounds-1000_1000.json",
489
+ "Max_Ops13_Bounds0_10000.json",
490
+ "Max_Ops13_Bounds0_1000.json",
491
+ "Max_Ops13_Bounds-10000_10000.json",
492
+ "Max_Ops13_Bounds-1000_1000.json",
493
+ "Max_Ops14_Bounds0_10000.json",
494
+ "Max_Ops14_Bounds0_1000.json",
495
+ "Max_Ops14_Bounds-10000_10000.json",
496
+ "Max_Ops14_Bounds-1000_1000.json",
497
+ "Max_Ops15_Bounds0_10000.json",
498
+ "Max_Ops15_Bounds0_1000.json",
499
+ "Max_Ops15_Bounds-10000_10000.json",
500
+ "Max_Ops15_Bounds-1000_1000.json",
501
+ "Max_Ops16_Bounds0_10000.json",
502
+ "Max_Ops16_Bounds0_1000.json",
503
+ "Max_Ops16_Bounds-10000_10000.json",
504
+ "Max_Ops16_Bounds-1000_1000.json",
505
+ "Max_Ops17_Bounds0_10000.json",
506
+ "Max_Ops17_Bounds0_1000.json",
507
+ "Max_Ops17_Bounds-10000_10000.json",
508
+ "Max_Ops17_Bounds-1000_1000.json",
509
+ "Max_Ops18_Bounds0_10000.json",
510
+ "Max_Ops18_Bounds0_1000.json",
511
+ "Max_Ops18_Bounds-10000_10000.json",
512
+ "Max_Ops18_Bounds-1000_1000.json",
513
+ "Max_Ops19_Bounds0_10000.json",
514
+ "Max_Ops19_Bounds0_1000.json",
515
+ "Max_Ops19_Bounds-10000_10000.json",
516
+ "Max_Ops19_Bounds-1000_1000.json",
517
+ "Max_Ops1_Bounds0_10000.json",
518
+ "Max_Ops1_Bounds0_1000.json",
519
+ "Max_Ops1_Bounds-10000_10000.json",
520
+ "Max_Ops1_Bounds-1000_1000.json",
521
+ "Max_Ops20_Bounds0_10000.json",
522
+ "Max_Ops20_Bounds0_1000.json",
523
+ "Max_Ops20_Bounds-10000_10000.json",
524
+ "Max_Ops20_Bounds-1000_1000.json",
525
+ "Max_Ops2_Bounds0_10000.json",
526
+ "Max_Ops2_Bounds0_1000.json",
527
+ "Max_Ops2_Bounds-10000_10000.json",
528
+ "Max_Ops2_Bounds-1000_1000.json",
529
+ "Max_Ops3_Bounds0_10000.json",
530
+ "Max_Ops3_Bounds0_1000.json",
531
+ "Max_Ops3_Bounds-10000_10000.json",
532
+ "Max_Ops3_Bounds-1000_1000.json",
533
+ "Max_Ops4_Bounds0_10000.json",
534
+ "Max_Ops4_Bounds0_1000.json",
535
+ "Max_Ops4_Bounds-10000_10000.json",
536
+ "Max_Ops4_Bounds-1000_1000.json",
537
+ "Max_Ops5_Bounds0_10000.json",
538
+ "Max_Ops5_Bounds0_1000.json",
539
+ "Max_Ops5_Bounds-10000_10000.json",
540
+ "Max_Ops5_Bounds-1000_1000.json",
541
+ "Max_Ops6_Bounds0_10000.json",
542
+ "Max_Ops6_Bounds0_1000.json",
543
+ "Max_Ops6_Bounds-10000_10000.json",
544
+ "Max_Ops6_Bounds-1000_1000.json",
545
+ "Max_Ops7_Bounds0_10000.json",
546
+ "Max_Ops7_Bounds0_1000.json",
547
+ "Max_Ops7_Bounds-10000_10000.json",
548
+ "Max_Ops7_Bounds-1000_1000.json",
549
+ "Max_Ops8_Bounds0_10000.json",
550
+ "Max_Ops8_Bounds0_1000.json",
551
+ "Max_Ops8_Bounds-10000_10000.json",
552
+ "Max_Ops8_Bounds-1000_1000.json",
553
+ "Max_Ops9_Bounds0_10000.json",
554
+ "Max_Ops9_Bounds0_1000.json",
555
+ "Max_Ops9_Bounds-10000_10000.json",
556
+ "Max_Ops9_Bounds-1000_1000.json"
557
+ ]
558
+ },
559
+ "train": "data/Arithmetic/Curriculum_XHard",
560
+ "val": "data/Arithmetic/Curriculum_XHard",
561
+ "test": "data/Arithmetic/Curriculum_XHard",
562
+ "filling_field": [
563
+ "Question",
564
+ "Answer"
565
+ ]
566
+ },
567
+ "GSM8K": {
568
+ "type": "local",
569
+ "dataset_purpose": "downstream",
570
+ "train_file": "data/GSM8K/GSM8K_train.json",
571
+ "val_file": "data/GSM8K/GSM8K_test.json",
572
+ "test_file": "data/GSM8K/GSM8K_dev.json",
573
+ "filling_field": [
574
+ "Body",
575
+ "Question",
576
+ "Answer"
577
+ ]
578
+ },
579
+ "APPS": {
580
+ "type": "local",
581
+ "dataset_purpose": "downstream",
582
+ "train_file": "data/APPS/apps_train.json",
583
+ "val_file": "data/APPS/apps_test.json",
584
+ "test_file": "data/APPS/apps_dev.json",
585
+ "filling_field": [
586
+ "Body",
587
+ "Question",
588
+ "Answer"
589
+ ]
590
+ },
591
+ "ghcode_python": {
592
+ "type": "huggingface",
593
+ "dataset_purpose": "pretrain",
594
+ "name": "slseanwu/ghcode_python_split_700k",
595
+ "max_eval_size": 1000,
596
+ "max_train_size": 160000,
597
+ "filling_field": [
598
+ "code"
599
+ ]
600
+ }
601
+ }
602
+ }
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2be67f6aac8e482bb2022409709d8774ffb125292c0c9cf025c0ae747f3a6d57
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 8.605851979345955,
5
+ "eval_steps": 2500,
6
+ "global_step": 5000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.17,
13
+ "grad_norm": 1.0365092754364014,
14
+ "learning_rate": 1.9600000000000002e-05,
15
+ "loss": 0.4163,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.34,
20
+ "grad_norm": 1.212004542350769,
21
+ "learning_rate": 1.9200000000000003e-05,
22
+ "loss": 0.3793,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.52,
27
+ "grad_norm": 1.066266417503357,
28
+ "learning_rate": 1.88e-05,
29
+ "loss": 0.3682,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.69,
34
+ "grad_norm": 1.322099208831787,
35
+ "learning_rate": 1.8400000000000003e-05,
36
+ "loss": 0.3536,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.86,
41
+ "grad_norm": 0.998599648475647,
42
+ "learning_rate": 1.8e-05,
43
+ "loss": 0.3282,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 1.03,
48
+ "grad_norm": 1.5098826885223389,
49
+ "learning_rate": 1.76e-05,
50
+ "loss": 0.3092,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 1.2,
55
+ "grad_norm": 1.05723237991333,
56
+ "learning_rate": 1.72e-05,
57
+ "loss": 0.2248,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 1.38,
62
+ "grad_norm": 1.0882526636123657,
63
+ "learning_rate": 1.6800000000000002e-05,
64
+ "loss": 0.2239,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 1.55,
69
+ "grad_norm": 1.1547547578811646,
70
+ "learning_rate": 1.64e-05,
71
+ "loss": 0.2342,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 1.72,
76
+ "grad_norm": 1.1294739246368408,
77
+ "learning_rate": 1.6000000000000003e-05,
78
+ "loss": 0.2158,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 1.89,
83
+ "grad_norm": 0.9624162912368774,
84
+ "learning_rate": 1.5600000000000003e-05,
85
+ "loss": 0.2096,
86
+ "step": 1100
87
+ },
88
+ {
89
+ "epoch": 2.07,
90
+ "grad_norm": 1.1864293813705444,
91
+ "learning_rate": 1.5200000000000002e-05,
92
+ "loss": 0.1787,
93
+ "step": 1200
94
+ },
95
+ {
96
+ "epoch": 2.24,
97
+ "grad_norm": 1.1997874975204468,
98
+ "learning_rate": 1.48e-05,
99
+ "loss": 0.1246,
100
+ "step": 1300
101
+ },
102
+ {
103
+ "epoch": 2.41,
104
+ "grad_norm": 1.2120954990386963,
105
+ "learning_rate": 1.4400000000000001e-05,
106
+ "loss": 0.1197,
107
+ "step": 1400
108
+ },
109
+ {
110
+ "epoch": 2.58,
111
+ "grad_norm": 0.6992385983467102,
112
+ "learning_rate": 1.4e-05,
113
+ "loss": 0.1185,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 2.75,
118
+ "grad_norm": 1.0601509809494019,
119
+ "learning_rate": 1.3600000000000002e-05,
120
+ "loss": 0.1241,
121
+ "step": 1600
122
+ },
123
+ {
124
+ "epoch": 2.93,
125
+ "grad_norm": 1.1058382987976074,
126
+ "learning_rate": 1.3200000000000002e-05,
127
+ "loss": 0.1282,
128
+ "step": 1700
129
+ },
130
+ {
131
+ "epoch": 3.1,
132
+ "grad_norm": 1.1598687171936035,
133
+ "learning_rate": 1.2800000000000001e-05,
134
+ "loss": 0.0847,
135
+ "step": 1800
136
+ },
137
+ {
138
+ "epoch": 3.27,
139
+ "grad_norm": 1.2096168994903564,
140
+ "learning_rate": 1.2400000000000002e-05,
141
+ "loss": 0.0616,
142
+ "step": 1900
143
+ },
144
+ {
145
+ "epoch": 3.44,
146
+ "grad_norm": 1.5343897342681885,
147
+ "learning_rate": 1.2e-05,
148
+ "loss": 0.0645,
149
+ "step": 2000
150
+ },
151
+ {
152
+ "epoch": 3.61,
153
+ "grad_norm": 1.165819764137268,
154
+ "learning_rate": 1.16e-05,
155
+ "loss": 0.0652,
156
+ "step": 2100
157
+ },
158
+ {
159
+ "epoch": 3.79,
160
+ "grad_norm": 1.3763171434402466,
161
+ "learning_rate": 1.1200000000000001e-05,
162
+ "loss": 0.0619,
163
+ "step": 2200
164
+ },
165
+ {
166
+ "epoch": 3.96,
167
+ "grad_norm": 0.9929534792900085,
168
+ "learning_rate": 1.0800000000000002e-05,
169
+ "loss": 0.0612,
170
+ "step": 2300
171
+ },
172
+ {
173
+ "epoch": 4.13,
174
+ "grad_norm": 1.1144566535949707,
175
+ "learning_rate": 1.04e-05,
176
+ "loss": 0.038,
177
+ "step": 2400
178
+ },
179
+ {
180
+ "epoch": 4.3,
181
+ "grad_norm": 1.150139570236206,
182
+ "learning_rate": 1e-05,
183
+ "loss": 0.0311,
184
+ "step": 2500
185
+ },
186
+ {
187
+ "epoch": 4.3,
188
+ "eval_loss": 0.35374003648757935,
189
+ "eval_runtime": 84.8171,
190
+ "eval_samples_per_second": 11.79,
191
+ "eval_steps_per_second": 2.948,
192
+ "step": 2500
193
+ },
194
+ {
195
+ "epoch": 4.48,
196
+ "grad_norm": 1.4293252229690552,
197
+ "learning_rate": 9.600000000000001e-06,
198
+ "loss": 0.0308,
199
+ "step": 2600
200
+ },
201
+ {
202
+ "epoch": 4.65,
203
+ "grad_norm": 1.1352962255477905,
204
+ "learning_rate": 9.200000000000002e-06,
205
+ "loss": 0.0308,
206
+ "step": 2700
207
+ },
208
+ {
209
+ "epoch": 4.82,
210
+ "grad_norm": 1.0544779300689697,
211
+ "learning_rate": 8.8e-06,
212
+ "loss": 0.033,
213
+ "step": 2800
214
+ },
215
+ {
216
+ "epoch": 4.99,
217
+ "grad_norm": 1.110599160194397,
218
+ "learning_rate": 8.400000000000001e-06,
219
+ "loss": 0.0318,
220
+ "step": 2900
221
+ },
222
+ {
223
+ "epoch": 5.16,
224
+ "grad_norm": 0.7125316262245178,
225
+ "learning_rate": 8.000000000000001e-06,
226
+ "loss": 0.0147,
227
+ "step": 3000
228
+ },
229
+ {
230
+ "epoch": 5.34,
231
+ "grad_norm": 0.9172051548957825,
232
+ "learning_rate": 7.600000000000001e-06,
233
+ "loss": 0.0156,
234
+ "step": 3100
235
+ },
236
+ {
237
+ "epoch": 5.51,
238
+ "grad_norm": 0.9805625081062317,
239
+ "learning_rate": 7.2000000000000005e-06,
240
+ "loss": 0.0145,
241
+ "step": 3200
242
+ },
243
+ {
244
+ "epoch": 5.68,
245
+ "grad_norm": 0.5053761601448059,
246
+ "learning_rate": 6.800000000000001e-06,
247
+ "loss": 0.0149,
248
+ "step": 3300
249
+ },
250
+ {
251
+ "epoch": 5.85,
252
+ "grad_norm": 1.1218398809432983,
253
+ "learning_rate": 6.4000000000000006e-06,
254
+ "loss": 0.0168,
255
+ "step": 3400
256
+ },
257
+ {
258
+ "epoch": 6.02,
259
+ "grad_norm": 0.3119220733642578,
260
+ "learning_rate": 6e-06,
261
+ "loss": 0.0154,
262
+ "step": 3500
263
+ },
264
+ {
265
+ "epoch": 6.2,
266
+ "grad_norm": 0.23416651785373688,
267
+ "learning_rate": 5.600000000000001e-06,
268
+ "loss": 0.0065,
269
+ "step": 3600
270
+ },
271
+ {
272
+ "epoch": 6.37,
273
+ "grad_norm": 0.6167200803756714,
274
+ "learning_rate": 5.2e-06,
275
+ "loss": 0.0079,
276
+ "step": 3700
277
+ },
278
+ {
279
+ "epoch": 6.54,
280
+ "grad_norm": 1.1704833507537842,
281
+ "learning_rate": 4.800000000000001e-06,
282
+ "loss": 0.0067,
283
+ "step": 3800
284
+ },
285
+ {
286
+ "epoch": 6.71,
287
+ "grad_norm": 0.8806678056716919,
288
+ "learning_rate": 4.4e-06,
289
+ "loss": 0.0093,
290
+ "step": 3900
291
+ },
292
+ {
293
+ "epoch": 6.88,
294
+ "grad_norm": 0.30924132466316223,
295
+ "learning_rate": 4.000000000000001e-06,
296
+ "loss": 0.007,
297
+ "step": 4000
298
+ },
299
+ {
300
+ "epoch": 7.06,
301
+ "grad_norm": 0.46306928992271423,
302
+ "learning_rate": 3.6000000000000003e-06,
303
+ "loss": 0.0052,
304
+ "step": 4100
305
+ },
306
+ {
307
+ "epoch": 7.23,
308
+ "grad_norm": 0.46887511014938354,
309
+ "learning_rate": 3.2000000000000003e-06,
310
+ "loss": 0.0042,
311
+ "step": 4200
312
+ },
313
+ {
314
+ "epoch": 7.4,
315
+ "grad_norm": 0.902063250541687,
316
+ "learning_rate": 2.8000000000000003e-06,
317
+ "loss": 0.0031,
318
+ "step": 4300
319
+ },
320
+ {
321
+ "epoch": 7.57,
322
+ "grad_norm": 0.1910380870103836,
323
+ "learning_rate": 2.4000000000000003e-06,
324
+ "loss": 0.0029,
325
+ "step": 4400
326
+ },
327
+ {
328
+ "epoch": 7.75,
329
+ "grad_norm": 0.6202380657196045,
330
+ "learning_rate": 2.0000000000000003e-06,
331
+ "loss": 0.0032,
332
+ "step": 4500
333
+ },
334
+ {
335
+ "epoch": 7.92,
336
+ "grad_norm": 0.5730396509170532,
337
+ "learning_rate": 1.6000000000000001e-06,
338
+ "loss": 0.0034,
339
+ "step": 4600
340
+ },
341
+ {
342
+ "epoch": 8.09,
343
+ "grad_norm": 0.10635427385568619,
344
+ "learning_rate": 1.2000000000000002e-06,
345
+ "loss": 0.0034,
346
+ "step": 4700
347
+ },
348
+ {
349
+ "epoch": 8.26,
350
+ "grad_norm": 0.1567939668893814,
351
+ "learning_rate": 8.000000000000001e-07,
352
+ "loss": 0.0027,
353
+ "step": 4800
354
+ },
355
+ {
356
+ "epoch": 8.43,
357
+ "grad_norm": 0.11498889327049255,
358
+ "learning_rate": 4.0000000000000003e-07,
359
+ "loss": 0.0015,
360
+ "step": 4900
361
+ },
362
+ {
363
+ "epoch": 8.61,
364
+ "grad_norm": 0.09903218597173691,
365
+ "learning_rate": 0.0,
366
+ "loss": 0.0017,
367
+ "step": 5000
368
+ },
369
+ {
370
+ "epoch": 8.61,
371
+ "eval_loss": 0.4493824243545532,
372
+ "eval_runtime": 84.77,
373
+ "eval_samples_per_second": 11.797,
374
+ "eval_steps_per_second": 2.949,
375
+ "step": 5000
376
+ }
377
+ ],
378
+ "logging_steps": 100,
379
+ "max_steps": 5000,
380
+ "num_input_tokens_seen": 0,
381
+ "num_train_epochs": 9,
382
+ "save_steps": 2500,
383
+ "total_flos": 1.258569996863275e+18,
384
+ "train_batch_size": 4,
385
+ "trial_name": null,
386
+ "trial_params": null
387
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42f576a53716a0435df09b5afe87ec075766f880338fac95b5faf5bd7ec56a3c
3
+ size 4856
training_logs.txt ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [train dset len] 18590
2
+ [valid dset len] 1000
3
+ /usr0/home/liangzel/anaconda3/envs/air2/lib/python3.11/site-packages/accelerate/accelerator.py:432:
4
+ FutureWarning: Passing the following arguments to `Accelerator` is deprecated
5
+ and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches',
6
+ 'split_batches', 'even_batches', 'use_seedable_sampler']). Please pass an
7
+ `accelerate.DataLoaderConfiguration` instead:
8
+ dataloader_config = DataLoaderConfiguration(dispatch_batches=None,
9
+ split_batches=False, even_batches=True, use_seedable_sampler=True)
10
+ warnings.warn(
11
+ {'loss': 0.4163, 'grad_norm': 1.0365092754364014, 'learning_rate':
12
+ 1.9600000000000002e-05, 'epoch': 0.17}
13
+ 3%|██▍
14
+ | 164/5000 [19:31<9:34:34, 7.13s/it]Token indices sequence length is longer
15
+ than the specified maximum sequence length for this model (24682 > 16384).
16
+ Running this sequence through the model will result in indexing errors
17
+ {'loss': 0.3793, 'grad_norm': 1.212004542350769, 'learning_rate':
18
+ 1.9200000000000003e-05, 'epoch': 0.34}
19
+ {'loss': 0.3682, 'grad_norm': 1.066266417503357, 'learning_rate': 1.88e-05,
20
+ 'epoch': 0.52}
21
+ {'loss': 0.3536, 'grad_norm': 1.322099208831787, 'learning_rate':
22
+ 1.8400000000000003e-05, 'epoch': 0.69}
23
+ {'loss': 0.3282, 'grad_norm': 0.998599648475647, 'learning_rate': 1.8e-05,
24
+ 'epoch': 0.86}
25
+ {'loss': 0.3092, 'grad_norm': 1.5098826885223389, 'learning_rate': 1.76e-05,
26
+ 'epoch': 1.03}
27
+ {'loss': 0.2248, 'grad_norm': 1.05723237991333, 'learning_rate': 1.72e-05,
28
+ 'epoch': 1.2}
29
+ {'loss': 0.2239, 'grad_norm': 1.0882526636123657, 'learning_rate':
30
+ 1.6800000000000002e-05, 'epoch': 1.38}
31
+ {'loss': 0.2342, 'grad_norm': 1.1547547578811646, 'learning_rate': 1.64e-05,
32
+ 'epoch': 1.55}
33
+ {'loss': 0.2158, 'grad_norm': 1.1294739246368408, 'learning_rate':
34
+ 1.6000000000000003e-05, 'epoch': 1.72}
35
+ {'loss': 0.2096, 'grad_norm': 0.9624162912368774, 'learning_rate':
36
+ 1.5600000000000003e-05, 'epoch': 1.89}
37
+ {'loss': 0.1787, 'grad_norm': 1.1864293813705444, 'learning_rate':
38
+ 1.5200000000000002e-05, 'epoch': 2.07}
39
+ {'loss': 0.1246, 'grad_norm': 1.1997874975204468, 'learning_rate': 1.48e-05,
40
+ 'epoch': 2.24}
41
+ {'loss': 0.1197, 'grad_norm': 1.2120954990386963, 'learning_rate':
42
+ 1.4400000000000001e-05, 'epoch': 2.41}
43
+ {'loss': 0.1185, 'grad_norm': 0.6992385983467102, 'learning_rate': 1.4e-05,
44
+ 'epoch': 2.58}
45
+ {'loss': 0.1241, 'grad_norm': 1.0601509809494019, 'learning_rate':
46
+ 1.3600000000000002e-05, 'epoch': 2.75}
47
+ {'loss': 0.1282, 'grad_norm': 1.1058382987976074, 'learning_rate':
48
+ 1.3200000000000002e-05, 'epoch': 2.93}
49
+ {'loss': 0.0847, 'grad_norm': 1.1598687171936035, 'learning_rate':
50
+ 1.2800000000000001e-05, 'epoch': 3.1}
51
+ {'loss': 0.0616, 'grad_norm': 1.2096168994903564, 'learning_rate':
52
+ 1.2400000000000002e-05, 'epoch': 3.27}
53
+ {'loss': 0.0645, 'grad_norm': 1.5343897342681885, 'learning_rate': 1.2e-05,
54
+ 'epoch': 3.44}
55
+ {'loss': 0.0652, 'grad_norm': 1.165819764137268, 'learning_rate': 1.16e-05,
56
+ 'epoch': 3.61}
57
+ {'loss': 0.0619, 'grad_norm': 1.3763171434402466, 'learning_rate':
58
+ 1.1200000000000001e-05, 'epoch': 3.79}
59
+ {'loss': 0.0612, 'grad_norm': 0.9929534792900085, 'learning_rate':
60
+ 1.0800000000000002e-05, 'epoch': 3.96}
61
+ {'loss': 0.038, 'grad_norm': 1.1144566535949707, 'learning_rate': 1.04e-05,
62
+ 'epoch': 4.13}
63
+ {'loss': 0.0311, 'grad_norm': 1.150139570236206, 'learning_rate': 1e-05,
64
+ 'epoch': 4.3}
65
+ {'eval_loss': 0.35374003648757935, 'eval_runtime': 84.8171,
66
+ 'eval_samples_per_second': 11.79, 'eval_steps_per_second': 2.948, 'epoch':
67
+ 4.3}
68
+ {'loss': 0.0308, 'grad_norm': 1.4293252229690552, 'learning_rate':
69
+ 9.600000000000001e-06, 'epoch': 4.48}
70
+ {'loss': 0.0308, 'grad_norm': 1.1352962255477905, 'learning_rate':
71
+ 9.200000000000002e-06, 'epoch': 4.65}
72
+ {'loss': 0.033, 'grad_norm': 1.0544779300689697, 'learning_rate': 8.8e-06,
73
+ 'epoch': 4.82}
74
+ {'loss': 0.0318, 'grad_norm': 1.110599160194397, 'learning_rate':
75
+ 8.400000000000001e-06, 'epoch': 4.99}
76
+ {'loss': 0.0147, 'grad_norm': 0.7125316262245178, 'learning_rate':
77
+ 8.000000000000001e-06, 'epoch': 5.16}
78
+ {'loss': 0.0156, 'grad_norm': 0.9172051548957825, 'learning_rate':
79
+ 7.600000000000001e-06, 'epoch': 5.34}
80
+ {'loss': 0.0145, 'grad_norm': 0.9805625081062317, 'learning_rate':
81
+ 7.2000000000000005e-06, 'epoch': 5.51}
82
+ {'loss': 0.0149, 'grad_norm': 0.5053761601448059, 'learning_rate':
83
+ 6.800000000000001e-06, 'epoch': 5.68}
84
+ {'loss': 0.0168, 'grad_norm': 1.1218398809432983, 'learning_rate':
85
+ 6.4000000000000006e-06, 'epoch': 5.85}
86
+ {'loss': 0.0154, 'grad_norm': 0.3119220733642578, 'learning_rate': 6e-06,
87
+ 'epoch': 6.02}
88
+ {'loss': 0.0065, 'grad_norm': 0.23416651785373688, 'learning_rate':
89
+ 5.600000000000001e-06, 'epoch': 6.2}
90
+ {'loss': 0.0079, 'grad_norm': 0.6167200803756714, 'learning_rate': 5.2e-06,
91
+ 'epoch': 6.37}
92
+ {'loss': 0.0067, 'grad_norm': 1.1704833507537842, 'learning_rate':
93
+ 4.800000000000001e-06, 'epoch': 6.54}
94
+ {'loss': 0.0093, 'grad_norm': 0.8806678056716919, 'learning_rate': 4.4e-06,
95
+ 'epoch': 6.71}
96
+ {'loss': 0.007, 'grad_norm': 0.30924132466316223, 'learning_rate':
97
+ 4.000000000000001e-06, 'epoch': 6.88}
98
+ {'loss': 0.0052, 'grad_norm': 0.46306928992271423, 'learning_rate':
99
+ 3.6000000000000003e-06, 'epoch': 7.06}
100
+ {'loss': 0.0042, 'grad_norm': 0.46887511014938354, 'learning_rate':
101
+ 3.2000000000000003e-06, 'epoch': 7.23}
102
+ {'loss': 0.0031, 'grad_norm': 0.902063250541687, 'learning_rate':
103
+ 2.8000000000000003e-06, 'epoch': 7.4}
104
+ {'loss': 0.0029, 'grad_norm': 0.1910380870103836, 'learning_rate':
105
+ 2.4000000000000003e-06, 'epoch': 7.57}
106
+ {'loss': 0.0032, 'grad_norm': 0.6202380657196045, 'learning_rate':
107
+ 2.0000000000000003e-06, 'epoch': 7.75}
108
+ {'loss': 0.0034, 'grad_norm': 0.5730396509170532, 'learning_rate':
109
+ 1.6000000000000001e-06, 'epoch': 7.92}
110
+ {'loss': 0.0034, 'grad_norm': 0.10635427385568619, 'learning_rate':
111
+ 1.2000000000000002e-06, 'epoch': 8.09}
112
+ {'loss': 0.0027, 'grad_norm': 0.1567939668893814, 'learning_rate':
113
+ 8.000000000000001e-07, 'epoch': 8.26}
114
+ {'loss': 0.0015, 'grad_norm': 0.11498889327049255, 'learning_rate':
115
+ 4.0000000000000003e-07, 'epoch': 8.43}
116
+ {'loss': 0.0017, 'grad_norm': 0.09903218597173691, 'learning_rate': 0.0,
117
+ 'epoch': 8.61}
118
+ {'eval_loss': 0.4493824243545532, 'eval_runtime': 84.77,
119
+ 'eval_samples_per_second': 11.797, 'eval_steps_per_second': 2.949, 'epoch':
120
+ 8.61}
121
+ {'train_runtime': 35516.5263, 'train_samples_per_second': 4.505,
122
+ 'train_steps_per_second': 0.141, 'train_loss': 0.09624048218131065, 'epoch':
123
+ 8.61}
124
+