alicegoesdown committed (verified)
Commit 1fc1f07 · 1 Parent(s): d834376

Training in progress, step 150, checkpoint
last-checkpoint/README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-base_model: bigscience/bloomz-560m
+base_model: unsloth/mistral-7b-v0.2-bnb-4bit
 library_name: peft
 ---
last-checkpoint/adapter_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "bigscience/bloomz-560m",
+  "base_model_name_or_path": "unsloth/mistral-7b-v0.2-bnb-4bit",
   "bias": "none",
   "fan_in_fan_out": false,
   "inference_mode": true,
@@ -14,20 +14,151 @@
   "lora_dropout": 0.3,
   "megatron_config": null,
   "megatron_core": "megatron.core",
-  "modules_to_save": null,
+  "modules_to_save": [
+    "lm_head"
+  ],
   "peft_type": "LORA",
   "r": 4,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "dense",
-    "query_key_value",
-    "lm_head",
-    "word_embeddings",
-    "dense_4h_to_h",
-    "dense_h_to_4h"
+    "model.layers.27.self_attn.v_proj",
+    "model.layers.28.mlp.down_proj",
+    "model.layers.12.self_attn.q_proj",
+    "model.layers.28.self_attn.o_proj",
+    "model.layers.9.self_attn.v_proj",
+    "model.layers.14.self_attn.q_proj",
+    "model.layers.19.self_attn.q_proj",
+    "model.layers.27.mlp.gate_proj",
+    "model.layers.17.self_attn.o_proj",
+    "model.layers.23.self_attn.o_proj",
+    "model.layers.22.self_attn.v_proj",
+    "model.layers.25.self_attn.o_proj",
+    "model.layers.0.mlp.down_proj",
+    "model.layers.20.mlp.gate_proj",
+    "model.layers.9.mlp.gate_proj",
+    "model.layers.18.self_attn.k_proj",
+    "model.layers.21.self_attn.k_proj",
+    "model.layers.17.self_attn.k_proj",
+    "model.layers.3.self_attn.q_proj",
+    "model.layers.24.self_attn.q_proj",
+    "model.layers.26.mlp.up_proj",
+    "model.layers.31.mlp.up_proj",
+    "model.layers.1.mlp.down_proj",
+    "model.layers.13.mlp.up_proj",
+    "model.layers.5.mlp.down_proj",
+    "model.layers.3.self_attn.o_proj",
+    "model.layers.29.self_attn.k_proj",
+    "model.layers.3.mlp.down_proj",
+    "model.layers.20.mlp.down_proj",
+    "model.layers.2.mlp.gate_proj",
+    "model.layers.16.self_attn.q_proj",
+    "model.layers.14.mlp.down_proj",
+    "model.layers.6.self_attn.o_proj",
+    "model.layers.26.self_attn.o_proj",
+    "model.layers.6.self_attn.k_proj",
+    "model.layers.21.mlp.down_proj",
+    "model.layers.9.mlp.down_proj",
+    "model.layers.31.self_attn.o_proj",
+    "model.layers.7.self_attn.k_proj",
+    "model.layers.12.self_attn.o_proj",
+    "model.layers.14.mlp.gate_proj",
+    "model.layers.8.mlp.down_proj",
+    "model.layers.16.self_attn.o_proj",
+    "model.layers.15.self_attn.k_proj",
+    "model.layers.21.self_attn.o_proj",
+    "model.layers.13.self_attn.v_proj",
+    "model.layers.7.mlp.down_proj",
+    "model.layers.21.self_attn.v_proj",
+    "model.layers.27.self_attn.k_proj",
+    "model.layers.18.self_attn.v_proj",
+    "model.layers.15.self_attn.q_proj",
+    "model.layers.1.self_attn.k_proj",
+    "model.layers.5.mlp.gate_proj",
+    "model.layers.2.self_attn.q_proj",
+    "model.layers.28.mlp.up_proj",
+    "model.layers.10.mlp.gate_proj",
+    "model.layers.29.mlp.up_proj",
+    "model.layers.7.self_attn.q_proj",
+    "model.layers.12.mlp.gate_proj",
+    "model.layers.20.mlp.up_proj",
+    "model.layers.4.mlp.up_proj",
+    "model.layers.23.self_attn.q_proj",
+    "model.layers.23.self_attn.v_proj",
+    "model.layers.1.self_attn.q_proj",
+    "model.layers.27.mlp.down_proj",
+    "model.layers.30.mlp.down_proj",
+    "model.layers.17.mlp.down_proj",
+    "model.layers.25.mlp.up_proj",
+    "model.layers.4.self_attn.k_proj",
+    "model.layers.2.mlp.up_proj",
+    "model.layers.6.mlp.gate_proj",
+    "model.layers.31.self_attn.v_proj",
+    "model.layers.30.self_attn.k_proj",
+    "model.layers.30.self_attn.q_proj",
+    "model.layers.30.mlp.gate_proj",
+    "model.layers.8.mlp.gate_proj",
+    "model.layers.13.mlp.down_proj",
+    "model.layers.10.self_attn.q_proj",
+    "model.layers.22.self_attn.q_proj",
+    "model.layers.15.self_attn.v_proj",
+    "model.layers.25.mlp.gate_proj",
+    "model.layers.4.self_attn.q_proj",
+    "model.embed_tokens",
+    "model.layers.14.self_attn.k_proj",
+    "model.layers.31.mlp.down_proj",
+    "model.layers.31.self_attn.q_proj",
+    "model.layers.1.mlp.up_proj",
+    "model.layers.2.mlp.down_proj",
+    "model.layers.12.mlp.up_proj",
+    "model.layers.19.mlp.up_proj",
+    "model.layers.0.self_attn.v_proj",
+    "model.layers.8.self_attn.v_proj",
+    "model.layers.5.self_attn.o_proj",
+    "model.layers.11.mlp.gate_proj",
+    "model.layers.18.mlp.up_proj",
+    "model.layers.10.self_attn.v_proj",
+    "model.layers.23.mlp.down_proj",
+    "model.layers.0.mlp.up_proj",
+    "model.layers.8.self_attn.k_proj",
+    "model.layers.13.mlp.gate_proj",
+    "model.layers.24.self_attn.k_proj",
+    "model.layers.24.mlp.down_proj",
+    "model.layers.15.mlp.up_proj",
+    "model.layers.25.self_attn.k_proj",
+    "model.layers.6.self_attn.v_proj",
+    "model.layers.29.self_attn.o_proj",
+    "model.layers.10.self_attn.o_proj",
+    "model.layers.31.mlp.gate_proj",
+    "model.layers.30.mlp.up_proj",
+    "model.layers.16.mlp.down_proj",
+    "model.layers.30.self_attn.o_proj",
+    "model.layers.7.self_attn.o_proj",
+    "model.layers.5.self_attn.k_proj",
+    "model.layers.0.self_attn.q_proj",
+    "model.layers.16.self_attn.k_proj",
+    "model.layers.19.mlp.down_proj",
+    "model.layers.17.self_attn.v_proj",
+    "model.layers.29.self_attn.v_proj",
+    "model.layers.9.self_attn.q_proj",
+    "model.layers.26.self_attn.k_proj",
+    "model.layers.11.self_attn.q_proj",
+    "model.layers.24.self_attn.v_proj",
+    "model.layers.18.mlp.down_proj",
+    "model.layers.11.self_attn.v_proj",
+    "model.layers.19.self_attn.k_proj",
+    "model.layers.26.self_attn.v_proj",
+    "model.layers.28.mlp.gate_proj",
+    "model.layers.22.self_attn.k_proj",
+    "model.layers.31.self_attn.k_proj",
+    "model.layers.3.mlp.gate_proj",
+    "model.layers.22.mlp.up_proj",
+    "model.layers.30.self_attn.v_proj",
+    "model.layers.4.mlp.down_proj",
+    "model.layers.20.self_attn.k_proj",
+    "model.layers.11.self_attn.o_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
-  "use_rslora": false
+  "use_rslora": true
 }
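For reference, the retargeted adapter configuration above maps onto a peft `LoraConfig` roughly as in the sketch below. This is a minimal illustration, not the training script from this repo: values not visible in the diff (for example `lora_alpha`) are omitted, and the module list is abbreviated.

```python
# Minimal sketch (not from this repo) of a LoraConfig matching the new adapter_config.json.
# Assumes a recent peft release (use_rslora) and that the 4-bit base loads via bitsandbytes.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("unsloth/mistral-7b-v0.2-bnb-4bit")

lora_config = LoraConfig(
    r=4,                          # "r": 4
    lora_dropout=0.3,             # "lora_dropout": 0.3
    bias="none",
    task_type="CAUSAL_LM",
    use_rslora=True,              # "use_rslora": true (was false for the BLOOM adapter)
    modules_to_save=["lm_head"],  # lm_head is trained fully and stored with the adapter
    target_modules=[
        # per-layer projections; the full 135-entry list is in the JSON above
        "model.layers.27.self_attn.v_proj",
        "model.layers.28.mlp.down_proj",
        "model.embed_tokens",
    ],
)

model = get_peft_model(base, lora_config)
model.save_pretrained("last-checkpoint")  # writes adapter_config.json + adapter_model.safetensors
```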
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2a2cbff943a923aeb62f4456b7f17ef0837662872b71b7c08d6fb4270a8dc1f3
-size 2069589712
+oid sha256:18a0ef7164d47257fa5e03e94a378e8b160912c89dea19a8e1f7af4d4f485094
+size 287644488
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:673995a3f8570aefa08233c28e15e70e4fd1e17563f71fbab930b3b315e2cb24
-size 28869370
+oid sha256:83bab8051a975e48fe6b37fa4732978d04aa6b82c8f1f55d13ddf5d2c07b7d75
+size 574288634
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:80e38744464beac4bc1528ae979524f096700fba5a31f0a335a53cc934b12ee8
+oid sha256:42f2102978cbe9c56d5a8a027ea80c143b48f9e1e590c1c68cf2a3125de2e811
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3c11ec2bb991f72e1a5d66ad02397dfaffa9ff7006a8f5dd335aa86b56dc8c92
-size 1064
+oid sha256:fac40e05bb032b9491fbd1b7ca84c8c3e5be3358f97f841daf2ab09e4beb7c6b
+size 1256
last-checkpoint/special_tokens_map.json CHANGED
@@ -14,7 +14,7 @@
     "single_word": false
   },
   "pad_token": {
-    "content": "<pad>",
+    "content": "<unk>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
last-checkpoint/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d963066d6adae5034a1dc114c3ac444512de09928cf14ed4562ba94d9a440e66
-size 21763085
+oid sha256:a2699839c243202a04a90537576fc719283f638e90fe80feb469888275289575
+size 3505751
last-checkpoint/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443
last-checkpoint/tokenizer_config.json CHANGED
@@ -1,5 +1,7 @@
 {
-  "add_prefix_space": false,
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
@@ -24,25 +26,19 @@
       "rstrip": false,
       "single_word": false,
       "special": true
-    },
-    "3": {
-      "content": "<pad>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
     }
   },
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
   "extra_special_tokens": {},
-  "merges_file": null,
+  "legacy": true,
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<pad>",
+  "pad_token": "<unk>",
   "padding_side": "left",
-  "tokenizer_class": "BloomTokenizer",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
   "unk_token": "<unk>",
-  "vocab_file": null
+  "use_default_system_prompt": false
 }
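The tokenizer files above switch from the BLOOM tokenizer to the Mistral/Llama SentencePiece tokenizer, reuse `<unk>` as the padding token, and pad on the left. A small sketch of the equivalent runtime setup (illustrative only; the base checkpoint already ships these settings):

```python
# Sketch: load the tokenizer the way the updated tokenizer_config.json describes it.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("unsloth/mistral-7b-v0.2-bnb-4bit")
tokenizer.pad_token = tokenizer.unk_token  # "pad_token": "<unk>" (the old <pad> token is gone)
tokenizer.padding_side = "left"            # "padding_side": "left"

batch = tokenizer(["short prompt", "a somewhat longer prompt"], padding=True, return_tensors="pt")
```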
last-checkpoint/trainer_state.json CHANGED
@@ -1,803 +1,125 @@
 {
-  "best_metric": 1.628559947013855,
-  "best_model_checkpoint": "./output/checkpoint-1050",
-  "epoch": 29.166666666666668,
+  "best_metric": 0.45632824301719666,
+  "best_model_checkpoint": "./output/checkpoint-150",
+  "epoch": 4.166666666666667,
   "eval_steps": 150,
-  "global_step": 1050,
+  "global_step": 150,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.2777777777777778,
-      "grad_norm": 3067.291259765625,
-      "learning_rate": 5.000000000000001e-07,
-      "loss": 5.7343,
+      "grad_norm": 16.678560256958008,
+      "learning_rate": 1.8053203938204972e-06,
+      "loss": 0.4404,
       "step": 10
     },
     {
       "epoch": 0.5555555555555556,
-      "grad_norm": 454.0848083496094,
-      "learning_rate": 1.0000000000000002e-06,
-      "loss": 4.3591,
+      "grad_norm": 24.250707626342773,
+      "learning_rate": 3.6106407876409943e-06,
+      "loss": 0.4338,
       "step": 20
     },
     {
       "epoch": 0.8333333333333334,
-      "grad_norm": 6852.0712890625,
-      "learning_rate": 1.5e-06,
-      "loss": 7.8247,
+      "grad_norm": 11.988567352294922,
+      "learning_rate": 5.4159611814614915e-06,
+      "loss": 0.5386,
       "step": 30
     },
     {
       "epoch": 1.1111111111111112,
-      "grad_norm": 86.0095443725586,
-      "learning_rate": 2.0000000000000003e-06,
-      "loss": 4.2165,
+      "grad_norm": 10.733481407165527,
+      "learning_rate": 7.221281575281989e-06,
+      "loss": 0.4805,
       "step": 40
     },
     {
       "epoch": 1.3888888888888888,
-      "grad_norm": 174.20822143554688,
-      "learning_rate": 2.5e-06,
-      "loss": 6.9714,
+      "grad_norm": 10.659712791442871,
+      "learning_rate": 9.026601969102486e-06,
+      "loss": 0.4308,
       "step": 50
     },
     {
       "epoch": 1.6666666666666665,
-      "grad_norm": 292.1271057128906,
-      "learning_rate": 3e-06,
-      "loss": 7.0917,
+      "grad_norm": 14.367152214050293,
+      "learning_rate": 1.0831922362922983e-05,
+      "loss": 0.4926,
       "step": 60
     },
     {
       "epoch": 1.9444444444444444,
-      "grad_norm": 301.70904541015625,
-      "learning_rate": 3.5e-06,
-      "loss": 4.4748,
+      "grad_norm": 26.407075881958008,
+      "learning_rate": 1.263724275674348e-05,
+      "loss": 0.4749,
       "step": 70
     },
     {
       "epoch": 2.2222222222222223,
-      "grad_norm": 117.82996368408203,
-      "learning_rate": 4.000000000000001e-06,
-      "loss": 4.6878,
+      "grad_norm": 16.550508499145508,
+      "learning_rate": 1.4442563150563977e-05,
+      "loss": 0.3476,
       "step": 80
     },
     {
       "epoch": 2.5,
-      "grad_norm": 141.16262817382812,
-      "learning_rate": 4.5e-06,
-      "loss": 4.0654,
+      "grad_norm": 12.457433700561523,
+      "learning_rate": 1.6247883544384475e-05,
+      "loss": 0.4611,
       "step": 90
     },
     {
       "epoch": 2.7777777777777777,
-      "grad_norm": 96.45869445800781,
-      "learning_rate": 5e-06,
-      "loss": 5.5673,
+      "grad_norm": 6.2512030601501465,
+      "learning_rate": 1.8053203938204972e-05,
+      "loss": 0.3172,
       "step": 100
     },
     {
       "epoch": 3.0555555555555554,
-      "grad_norm": 99.14539337158203,
-      "learning_rate": 4.999948617395916e-06,
-      "loss": 4.7921,
+      "grad_norm": 12.345681190490723,
+      "learning_rate": 1.8053018414078887e-05,
+      "loss": 0.3607,
       "step": 110
     },
     {
       "epoch": 3.3333333333333335,
-      "grad_norm": 3432.715576171875,
-      "learning_rate": 4.9997944716957985e-06,
-      "loss": 4.7846,
+      "grad_norm": 10.975536346435547,
+      "learning_rate": 1.8052461849326804e-05,
+      "loss": 0.2315,
       "step": 120
     },
     {
       "epoch": 3.611111111111111,
-      "grad_norm": 104.461181640625,
-      "learning_rate": 4.999537569235975e-06,
-      "loss": 3.8068,
+      "grad_norm": 22.538253784179688,
+      "learning_rate": 1.8051534266826923e-05,
+      "loss": 0.2458,
       "step": 130
     },
     {
       "epoch": 3.888888888888889,
-      "grad_norm": 167.98757934570312,
-      "learning_rate": 4.9991779205767e-06,
-      "loss": 3.5173,
+      "grad_norm": 9.7154541015625,
+      "learning_rate": 1.8050235704708523e-05,
+      "loss": 0.2693,
       "step": 140
     },
     {
       "epoch": 4.166666666666667,
-      "grad_norm": 732.2133178710938,
-      "learning_rate": 4.99871554050172e-06,
-      "loss": 4.0311,
+      "grad_norm": 7.695099353790283,
+      "learning_rate": 1.8048566216350408e-05,
+      "loss": 0.2051,
       "step": 150
     },
     {
       "epoch": 4.166666666666667,
-      "eval_loss": 3.2265913486480713,
-      "eval_runtime": 0.3402,
-      "eval_samples_per_second": 26.456,
-      "eval_steps_per_second": 26.456,
+      "eval_loss": 0.45632824301719666,
+      "eval_runtime": 0.9401,
+      "eval_samples_per_second": 9.574,
+      "eval_steps_per_second": 9.574,
       "step": 150
-    },
-    {
-      "epoch": 4.444444444444445,
-      "grad_norm": 101.75000762939453,
-      "learning_rate": 4.99815044801767e-06,
-      "loss": 3.9084,
-      "step": 160
-    },
-    {
-      "epoch": 4.722222222222222,
-      "grad_norm": 68.58529663085938,
-      "learning_rate": 4.997482666353287e-06,
-      "loss": 3.6879,
-      "step": 170
-    },
-    {
-      "epoch": 5.0,
-      "grad_norm": 75.727294921875,
-      "learning_rate": 4.9967122229584614e-06,
-      "loss": 3.6844,
-      "step": 180
-    },
-    {
-      "epoch": 5.277777777777778,
-      "grad_norm": 191.76918029785156,
-      "learning_rate": 4.995839149503103e-06,
-      "loss": 3.9383,
-      "step": 190
-    },
-    {
-      "epoch": 5.555555555555555,
-      "grad_norm": 72.1632080078125,
-      "learning_rate": 4.994863481875842e-06,
-      "loss": 3.5542,
-      "step": 200
-    },
-    {
-      "epoch": 5.833333333333333,
-      "grad_norm": 158.5887908935547,
-      "learning_rate": 4.993785260182552e-06,
-      "loss": 3.4056,
-      "step": 210
-    },
-    {
-      "epoch": 6.111111111111111,
-      "grad_norm": 89.98544311523438,
-      "learning_rate": 4.992604528744705e-06,
-      "loss": 3.5972,
-      "step": 220
-    },
-    {
-      "epoch": 6.388888888888889,
-      "grad_norm": 124.9444351196289,
-      "learning_rate": 4.991321336097546e-06,
-      "loss": 3.819,
-      "step": 230
-    },
-    {
-      "epoch": 6.666666666666667,
-      "grad_norm": 97.10274505615234,
-      "learning_rate": 4.989935734988098e-06,
-      "loss": 3.2709,
-      "step": 240
-    },
-    {
-      "epoch": 6.944444444444445,
-      "grad_norm": 107.55123138427734,
-      "learning_rate": 4.988447782372996e-06,
-      "loss": 3.0078,
-      "step": 250
-    },
-    {
-      "epoch": 7.222222222222222,
-      "grad_norm": 102.92362213134766,
-      "learning_rate": 4.986857539416144e-06,
-      "loss": 3.3251,
-      "step": 260
-    },
-    {
-      "epoch": 7.5,
-      "grad_norm": 83.20296478271484,
-      "learning_rate": 4.985165071486201e-06,
-      "loss": 3.669,
-      "step": 270
-    },
-    {
-      "epoch": 7.777777777777778,
-      "grad_norm": 78.28851318359375,
-      "learning_rate": 4.983370448153896e-06,
-      "loss": 3.1768,
-      "step": 280
-    },
-    {
-      "epoch": 8.055555555555555,
-      "grad_norm": 64.56950378417969,
-      "learning_rate": 4.981473743189163e-06,
-      "loss": 3.2936,
-      "step": 290
-    },
-    {
-      "epoch": 8.333333333333334,
-      "grad_norm": 71.38610076904297,
-      "learning_rate": 4.979475034558115e-06,
-      "loss": 2.9679,
-      "step": 300
-    },
-    {
-      "epoch": 8.333333333333334,
-      "eval_loss": 2.776277542114258,
-      "eval_runtime": 0.3519,
-      "eval_samples_per_second": 25.575,
-      "eval_steps_per_second": 25.575,
-      "step": 300
-    },
-    {
-      "epoch": 8.61111111111111,
-      "grad_norm": 70.84721374511719,
-      "learning_rate": 4.977374404419838e-06,
-      "loss": 3.8222,
-      "step": 310
-    },
-    {
-      "epoch": 8.88888888888889,
-      "grad_norm": 57.08628845214844,
-      "learning_rate": 4.9751719391230055e-06,
-      "loss": 3.139,
-      "step": 320
-    },
-    {
-      "epoch": 9.166666666666666,
-      "grad_norm": 114.66728973388672,
-      "learning_rate": 4.9728677292023405e-06,
-      "loss": 2.7926,
-      "step": 330
-    },
-    {
-      "epoch": 9.444444444444445,
-      "grad_norm": 107.12257385253906,
-      "learning_rate": 4.97046186937489e-06,
-      "loss": 3.6297,
-      "step": 340
-    },
-    {
-      "epoch": 9.722222222222221,
-      "grad_norm": 123.43621063232422,
-      "learning_rate": 4.967954458536126e-06,
-      "loss": 2.9921,
-      "step": 350
-    },
-    {
-      "epoch": 10.0,
-      "grad_norm": 90.94454956054688,
-      "learning_rate": 4.965345599755888e-06,
-      "loss": 2.9652,
-      "step": 360
-    },
-    {
-      "epoch": 10.277777777777779,
-      "grad_norm": 95.23873901367188,
-      "learning_rate": 4.9626354002741424e-06,
-      "loss": 3.0514,
-      "step": 370
-    },
-    {
-      "epoch": 10.555555555555555,
-      "grad_norm": 59.19866943359375,
-      "learning_rate": 4.959823971496575e-06,
-      "loss": 2.7505,
-      "step": 380
-    },
-    {
-      "epoch": 10.833333333333334,
-      "grad_norm": 86.36837768554688,
-      "learning_rate": 4.95691142899001e-06,
-      "loss": 2.9705,
-      "step": 390
-    },
-    {
-      "epoch": 11.11111111111111,
-      "grad_norm": 78.96321868896484,
-      "learning_rate": 4.953897892477664e-06,
-      "loss": 3.1144,
-      "step": 400
-    },
-    {
-      "epoch": 11.38888888888889,
-      "grad_norm": 53.98061752319336,
-      "learning_rate": 4.950783485834218e-06,
-      "loss": 2.5288,
-      "step": 410
-    },
-    {
-      "epoch": 11.666666666666666,
-      "grad_norm": 113.72119140625,
-      "learning_rate": 4.947568337080733e-06,
-      "loss": 2.8438,
-      "step": 420
-    },
-    {
-      "epoch": 11.944444444444445,
-      "grad_norm": 81.29522705078125,
-      "learning_rate": 4.944252578379379e-06,
-      "loss": 2.7956,
-      "step": 430
-    },
-    {
-      "epoch": 12.222222222222221,
-      "grad_norm": 58.65493392944336,
-      "learning_rate": 4.940836346028011e-06,
-      "loss": 2.8808,
-      "step": 440
-    },
-    {
-      "epoch": 12.5,
-      "grad_norm": 88.85804748535156,
-      "learning_rate": 4.937319780454559e-06,
-      "loss": 2.9837,
-      "step": 450
-    },
-    {
-      "epoch": 12.5,
-      "eval_loss": 2.297574281692505,
-      "eval_runtime": 0.3379,
-      "eval_samples_per_second": 26.638,
-      "eval_steps_per_second": 26.638,
-      "step": 450
-    },
-    {
-      "epoch": 12.777777777777779,
-      "grad_norm": 154.9428253173828,
-      "learning_rate": 4.933703026211262e-06,
-      "loss": 2.4245,
-      "step": 460
-    },
-    {
-      "epoch": 13.055555555555555,
-      "grad_norm": 60.50612258911133,
-      "learning_rate": 4.92998623196872e-06,
-      "loss": 2.5858,
-      "step": 470
-    },
-    {
-      "epoch": 13.333333333333334,
-      "grad_norm": 65.40650177001953,
-      "learning_rate": 4.926169550509787e-06,
-      "loss": 2.865,
-      "step": 480
-    },
-    {
-      "epoch": 13.61111111111111,
-      "grad_norm": 61.311580657958984,
-      "learning_rate": 4.9222531387232885e-06,
-      "loss": 2.3105,
-      "step": 490
-    },
-    {
-      "epoch": 13.88888888888889,
-      "grad_norm": 132.292236328125,
-      "learning_rate": 4.918237157597574e-06,
-      "loss": 2.6459,
-      "step": 500
-    },
-    {
-      "epoch": 14.166666666666666,
-      "grad_norm": 64.7785873413086,
-      "learning_rate": 4.914121772213898e-06,
-      "loss": 2.1178,
-      "step": 510
-    },
-    {
-      "epoch": 14.444444444444445,
-      "grad_norm": 63.461647033691406,
-      "learning_rate": 4.909907151739634e-06,
-      "loss": 2.6851,
-      "step": 520
-    },
-    {
-      "epoch": 14.722222222222221,
-      "grad_norm": 81.0846939086914,
-      "learning_rate": 4.905593469421323e-06,
-      "loss": 2.3994,
-      "step": 530
-    },
-    {
-      "epoch": 15.0,
-      "grad_norm": 126.24242401123047,
-      "learning_rate": 4.901180902577549e-06,
-      "loss": 2.4261,
-      "step": 540
-    },
-    {
-      "epoch": 15.277777777777779,
-      "grad_norm": 85.41973876953125,
-      "learning_rate": 4.896669632591652e-06,
-      "loss": 2.343,
-      "step": 550
-    },
-    {
-      "epoch": 15.555555555555555,
-      "grad_norm": 109.64722442626953,
-      "learning_rate": 4.892059844904273e-06,
-      "loss": 2.3353,
-      "step": 560
-    },
-    {
-      "epoch": 15.833333333333334,
-      "grad_norm": 78.56090545654297,
-      "learning_rate": 4.8873517290057265e-06,
-      "loss": 2.3844,
-      "step": 570
-    },
-    {
-      "epoch": 16.11111111111111,
-      "grad_norm": 49.437007904052734,
-      "learning_rate": 4.882545478428219e-06,
-      "loss": 2.2314,
-      "step": 580
-    },
-    {
-      "epoch": 16.38888888888889,
-      "grad_norm": 54.6522331237793,
-      "learning_rate": 4.8776412907378845e-06,
-      "loss": 2.3513,
-      "step": 590
-    },
-    {
-      "epoch": 16.666666666666668,
-      "grad_norm": 67.71409606933594,
-      "learning_rate": 4.872639367526672e-06,
-      "loss": 2.6987,
-      "step": 600
-    },
-    {
-      "epoch": 16.666666666666668,
-      "eval_loss": 1.923978567123413,
-      "eval_runtime": 0.3404,
-      "eval_samples_per_second": 26.437,
-      "eval_steps_per_second": 26.437,
-      "step": 600
-    },
-    {
-      "epoch": 16.944444444444443,
-      "grad_norm": 129.5966339111328,
-      "learning_rate": 4.8675399144040535e-06,
-      "loss": 1.8938,
-      "step": 610
-    },
-    {
-      "epoch": 17.22222222222222,
-      "grad_norm": 131.9312744140625,
-      "learning_rate": 4.862343140988573e-06,
-      "loss": 2.3376,
-      "step": 620
-    },
-    {
-      "epoch": 17.5,
-      "grad_norm": 74.60222625732422,
-      "learning_rate": 4.857049260899233e-06,
-      "loss": 1.6671,
-      "step": 630
-    },
-    {
-      "epoch": 17.77777777777778,
-      "grad_norm": 71.88908386230469,
-      "learning_rate": 4.851658491746707e-06,
-      "loss": 2.4909,
-      "step": 640
-    },
-    {
-      "epoch": 18.055555555555557,
-      "grad_norm": 61.891929626464844,
-      "learning_rate": 4.846171055124401e-06,
-      "loss": 2.1035,
-      "step": 650
-    },
-    {
-      "epoch": 18.333333333333332,
-      "grad_norm": 92.33902740478516,
-      "learning_rate": 4.8405871765993435e-06,
-      "loss": 2.3135,
-      "step": 660
-    },
-    {
-      "epoch": 18.61111111111111,
-      "grad_norm": 119.72765350341797,
-      "learning_rate": 4.834907085702909e-06,
-      "loss": 2.0914,
-      "step": 670
-    },
-    {
-      "epoch": 18.88888888888889,
-      "grad_norm": 85.96342468261719,
-      "learning_rate": 4.829131015921386e-06,
-      "loss": 2.081,
-      "step": 680
-    },
-    {
-      "epoch": 19.166666666666668,
-      "grad_norm": 43.483604431152344,
-      "learning_rate": 4.82325920468638e-06,
-      "loss": 2.0018,
-      "step": 690
-    },
-    {
-      "epoch": 19.444444444444443,
-      "grad_norm": 58.90408706665039,
-      "learning_rate": 4.817291893365055e-06,
-      "loss": 2.0761,
-      "step": 700
-    },
-    {
-      "epoch": 19.72222222222222,
-      "grad_norm": 36.50251007080078,
-      "learning_rate": 4.811229327250204e-06,
-      "loss": 1.8106,
-      "step": 710
-    },
-    {
-      "epoch": 20.0,
-      "grad_norm": 165.16329956054688,
-      "learning_rate": 4.805071755550177e-06,
-      "loss": 2.0202,
-      "step": 720
-    },
-    {
-      "epoch": 20.27777777777778,
-      "grad_norm": 59.47331619262695,
-      "learning_rate": 4.7988194313786275e-06,
-      "loss": 2.0095,
-      "step": 730
-    },
-    {
-      "epoch": 20.555555555555557,
-      "grad_norm": 85.59427642822266,
-      "learning_rate": 4.7924726117441135e-06,
-      "loss": 1.9848,
-      "step": 740
-    },
-    {
-      "epoch": 20.833333333333332,
-      "grad_norm": 135.71310424804688,
-      "learning_rate": 4.786031557539532e-06,
-      "loss": 1.8273,
-      "step": 750
-    },
-    {
-      "epoch": 20.833333333333332,
-      "eval_loss": 1.7372846603393555,
-      "eval_runtime": 0.3551,
-      "eval_samples_per_second": 25.346,
-      "eval_steps_per_second": 25.346,
-      "step": 750
-    },
-    {
-      "epoch": 21.11111111111111,
-      "grad_norm": 54.28533935546875,
-      "learning_rate": 4.779496533531393e-06,
-      "loss": 1.9957,
-      "step": 760
-    },
-    {
-      "epoch": 21.38888888888889,
-      "grad_norm": 61.805938720703125,
-      "learning_rate": 4.7728678083489375e-06,
-      "loss": 1.8208,
-      "step": 770
-    },
-    {
-      "epoch": 21.666666666666668,
-      "grad_norm": 74.6706314086914,
-      "learning_rate": 4.766145654473096e-06,
-      "loss": 1.8079,
-      "step": 780
-    },
-    {
-      "epoch": 21.944444444444443,
-      "grad_norm": 94.1037368774414,
-      "learning_rate": 4.7593303482252835e-06,
-      "loss": 1.9016,
-      "step": 790
-    },
-    {
-      "epoch": 22.22222222222222,
-      "grad_norm": 52.785804748535156,
-      "learning_rate": 4.752422169756048e-06,
-      "loss": 1.4806,
-      "step": 800
-    },
-    {
-      "epoch": 22.5,
-      "grad_norm": 137.42938232421875,
-      "learning_rate": 4.745421403033548e-06,
-      "loss": 2.0772,
-      "step": 810
-    },
-    {
-      "epoch": 22.77777777777778,
-      "grad_norm": 49.996612548828125,
-      "learning_rate": 4.738328335831883e-06,
-      "loss": 1.9437,
-      "step": 820
-    },
-    {
-      "epoch": 23.055555555555557,
-      "grad_norm": 237.96719360351562,
-      "learning_rate": 4.7311432597192655e-06,
-      "loss": 1.9811,
-      "step": 830
-    },
-    {
-      "epoch": 23.333333333333332,
-      "grad_norm": 83.38140106201172,
-      "learning_rate": 4.72386647004603e-06,
-      "loss": 2.0359,
-      "step": 840
-    },
-    {
-      "epoch": 23.61111111111111,
-      "grad_norm": 101.59205627441406,
-      "learning_rate": 4.716498265932501e-06,
-      "loss": 1.4225,
-      "step": 850
-    },
-    {
-      "epoch": 23.88888888888889,
-      "grad_norm": 53.51150131225586,
-      "learning_rate": 4.7090389502566884e-06,
-      "loss": 1.6665,
-      "step": 860
-    },
-    {
-      "epoch": 24.166666666666668,
-      "grad_norm": 61.94685363769531,
-      "learning_rate": 4.701488829641845e-06,
-      "loss": 1.7755,
-      "step": 870
-    },
-    {
-      "epoch": 24.444444444444443,
-      "grad_norm": 144.9290008544922,
-      "learning_rate": 4.693848214443858e-06,
-      "loss": 1.8703,
-      "step": 880
-    },
-    {
-      "epoch": 24.72222222222222,
-      "grad_norm": 66.09136962890625,
-      "learning_rate": 4.686117418738489e-06,
-      "loss": 1.8144,
-      "step": 890
-    },
-    {
-      "epoch": 25.0,
-      "grad_norm": 42.97128677368164,
-      "learning_rate": 4.678296760308474e-06,
-      "loss": 1.4169,
-      "step": 900
-    },
-    {
-      "epoch": 25.0,
-      "eval_loss": 1.650766372680664,
-      "eval_runtime": 0.3362,
-      "eval_samples_per_second": 26.77,
-      "eval_steps_per_second": 26.77,
-      "step": 900
-    },
-    {
-      "epoch": 25.27777777777778,
-      "grad_norm": 79.94445037841797,
-      "learning_rate": 4.670386560630446e-06,
-      "loss": 1.7837,
-      "step": 910
-    },
-    {
-      "epoch": 25.555555555555557,
-      "grad_norm": 51.69401168823242,
-      "learning_rate": 4.6623871448617345e-06,
-      "loss": 1.6088,
-      "step": 920
-    },
-    {
-      "epoch": 25.833333333333332,
-      "grad_norm": 57.889190673828125,
-      "learning_rate": 4.654298841826988e-06,
-      "loss": 1.6924,
-      "step": 930
-    },
-    {
-      "epoch": 26.11111111111111,
-      "grad_norm": 105.6498794555664,
-      "learning_rate": 4.646121984004666e-06,
-      "loss": 1.5328,
-      "step": 940
-    },
-    {
-      "epoch": 26.38888888888889,
-      "grad_norm": 40.925804138183594,
-      "learning_rate": 4.637856907513366e-06,
-      "loss": 1.4407,
-      "step": 950
-    },
-    {
-      "epoch": 26.666666666666668,
-      "grad_norm": 95.5039291381836,
-      "learning_rate": 4.629503952098011e-06,
-      "loss": 1.7479,
-      "step": 960
-    },
-    {
-      "epoch": 26.944444444444443,
-      "grad_norm": 82.50537872314453,
-      "learning_rate": 4.621063461115882e-06,
-      "loss": 1.4828,
-      "step": 970
-    },
-    {
-      "epoch": 27.22222222222222,
-      "grad_norm": 58.20735168457031,
-      "learning_rate": 4.612535781522504e-06,
-      "loss": 1.3901,
-      "step": 980
-    },
-    {
-      "epoch": 27.5,
-      "grad_norm": 103.87213897705078,
-      "learning_rate": 4.6039212638573835e-06,
-      "loss": 1.6103,
-      "step": 990
-    },
-    {
-      "epoch": 27.77777777777778,
-      "grad_norm": 46.515586853027344,
-      "learning_rate": 4.5952202622296015e-06,
-      "loss": 1.6174,
-      "step": 1000
-    },
-    {
-      "epoch": 28.055555555555557,
-      "grad_norm": 83.15587615966797,
-      "learning_rate": 4.586433134303257e-06,
-      "loss": 1.477,
-      "step": 1010
-    },
-    {
-      "epoch": 28.333333333333332,
-      "grad_norm": 89.03128051757812,
-      "learning_rate": 4.57756024128276e-06,
-      "loss": 1.6058,
-      "step": 1020
-    },
-    {
-      "epoch": 28.61111111111111,
-      "grad_norm": 53.300315856933594,
-      "learning_rate": 4.5686019478979915e-06,
-      "loss": 1.392,
-      "step": 1030
-    },
-    {
-      "epoch": 28.88888888888889,
-      "grad_norm": 63.217716217041016,
-      "learning_rate": 4.559558622389304e-06,
-      "loss": 1.4695,
-      "step": 1040
-    },
-    {
-      "epoch": 29.166666666666668,
-      "grad_norm": 38.29002380371094,
-      "learning_rate": 4.55043063649239e-06,
-      "loss": 1.3374,
-      "step": 1050
-    },
-    {
-      "epoch": 29.166666666666668,
-      "eval_loss": 1.628559947013855,
-      "eval_runtime": 0.3368,
-      "eval_samples_per_second": 26.721,
-      "eval_steps_per_second": 26.721,
-      "step": 1050
     }
   ],
   "logging_steps": 10,
@@ -817,7 +139,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 131723662393344.0,
+  "total_flos": 1908279321477120.0,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:87daafd6e0abf7887fe4ba7916c751fc99e13e4a39fb4df26920bf9257b8000d
-size 5368
+oid sha256:2bdf1390d17124cf4f93eac9a9298192534d0bd0ab8b33fa14cac1a258bda108
+size 5496