phinniaspp commited on
Commit
c9649cd
·
verified ·
1 Parent(s): f2ddd52

Training in progress, step 100, checkpoint

Browse files
last-checkpoint/README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
  base_model: mistralai/Mistral-7B-Instruct-v0.2
3
  library_name: peft
4
- pipeline_tag: text-generation
5
- tags:
6
- - base_model:adapter:mistralai/Mistral-7B-Instruct-v0.2
7
- - lora
8
- - sft
9
- - trl
10
  ---
11
 
12
  # Model Card for Model ID
@@ -205,4 +199,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
205
  [More Information Needed]
206
  ### Framework versions
207
 
208
- - PEFT 0.19.1
 
1
  ---
2
  base_model: mistralai/Mistral-7B-Instruct-v0.2
3
  library_name: peft
 
 
 
 
 
 
4
  ---
5
 
6
  # Model Card for Model ID
 
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
+ - PEFT 0.13.0
last-checkpoint/adapter_config.json CHANGED
@@ -1,14 +1,8 @@
1
  {
2
- "alora_invocation_tokens": null,
3
  "alpha_pattern": {},
4
- "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": null,
7
  "bias": "none",
8
- "corda_config": null,
9
- "ensure_weight_tying": false,
10
- "eva_config": null,
11
- "exclude_modules": null,
12
  "fan_in_fan_out": false,
13
  "inference_mode": true,
14
  "init_lora_weights": true,
@@ -17,32 +11,24 @@
17
  "layers_to_transform": null,
18
  "loftq_config": {},
19
  "lora_alpha": 32,
20
- "lora_bias": false,
21
  "lora_dropout": 0.05,
22
- "lora_ga_config": null,
23
  "megatron_config": null,
24
  "megatron_core": "megatron.core",
25
  "modules_to_save": null,
26
  "peft_type": "LORA",
27
- "peft_version": "0.19.1",
28
- "qalora_group_size": 16,
29
  "r": 16,
30
  "rank_pattern": {},
31
  "revision": null,
32
  "target_modules": [
33
- "q_proj",
 
34
  "gate_proj",
35
- "up_proj",
36
- "k_proj",
37
  "v_proj",
38
- "down_proj",
39
- "o_proj"
40
  ],
41
- "target_parameters": null,
42
  "task_type": "CAUSAL_LM",
43
- "trainable_token_indices": null,
44
- "use_bdlora": null,
45
  "use_dora": false,
46
- "use_qalora": false,
47
  "use_rslora": false
48
  }
 
1
  {
 
2
  "alpha_pattern": {},
 
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
5
  "bias": "none",
 
 
 
 
6
  "fan_in_fan_out": false,
7
  "inference_mode": true,
8
  "init_lora_weights": true,
 
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
  "lora_alpha": 32,
 
14
  "lora_dropout": 0.05,
 
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
 
 
19
  "r": 16,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "o_proj",
24
+ "down_proj",
25
  "gate_proj",
26
+ "q_proj",
 
27
  "v_proj",
28
+ "k_proj",
29
+ "up_proj"
30
  ],
 
31
  "task_type": "CAUSAL_LM",
 
 
32
  "use_dora": false,
 
33
  "use_rslora": false
34
  }
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0c28ae9fda7ed077120f5397e44e06d2691b2ea2ada8010ba232ded303bcf15
3
- size 83953808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fe43da4e97eebeaa0158b36a25c885b64c459b8e04460cfe58ff057a1a9e33d
3
+ size 83945744
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f9ebec1e37b3d5f04a61418f5f4316e15169a3a32f9b73632b74e8003b53ca8
3
- size 85733925
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:481d0894db7c13052290caab3e52d3e88de5e5cdf4ec83a8110888cb9d8eb75e
3
+ size 85728997
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:768b685cf09f245ca09014d59140a335cf62bc70a754dee586b97004fcdb04c8
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c489d94a3653edcd1381871cbb21ee09c7242857160be9a015191844079d14a
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19476b10a067a1a5864bfeae7a86583ec2326a22d2ca0845e094959744d7ef6d
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31dcc952e23cde9c32791f5eef02f3ae04c11b4696574b49b612390af96bc354
3
  size 1465
last-checkpoint/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
last-checkpoint/tokenizer.json CHANGED
@@ -1,6 +1,11 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
 
 
 
 
 
4
  "padding": null,
5
  "added_tokens": [
6
  {
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 1024,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
  "padding": null,
10
  "added_tokens": [
11
  {
last-checkpoint/tokenizer_config.json CHANGED
@@ -1,19 +1,45 @@
1
  {
 
 
2
  "add_prefix_space": null,
3
- "backend": "tokenizers",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  "bos_token": "<s>",
 
5
  "clean_up_tokenization_spaces": false,
6
  "eos_token": "</s>",
7
- "extra_special_tokens": [],
8
- "is_local": false,
9
  "legacy": false,
10
- "local_files_only": false,
11
  "model_max_length": 1000000000000000019884624838656,
12
  "pad_token": "</s>",
13
  "padding_side": "right",
14
  "sp_model_kwargs": {},
15
  "spaces_between_special_tokens": false,
16
- "tokenizer_class": "TokenizersBackend",
17
  "unk_token": "<unk>",
18
  "use_default_system_prompt": false
19
  }
 
1
  {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
  "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
  "bos_token": "<s>",
33
+ "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n",
34
  "clean_up_tokenization_spaces": false,
35
  "eos_token": "</s>",
 
 
36
  "legacy": false,
 
37
  "model_max_length": 1000000000000000019884624838656,
38
  "pad_token": "</s>",
39
  "padding_side": "right",
40
  "sp_model_kwargs": {},
41
  "spaces_between_special_tokens": false,
42
+ "tokenizer_class": "LlamaTokenizer",
43
  "unk_token": "<unk>",
44
  "use_default_system_prompt": false
45
  }
last-checkpoint/trainer_state.json CHANGED
@@ -1,425 +1,52 @@
1
  {
2
- "best_global_step": 800,
3
- "best_metric": 0.6767381429672241,
4
- "best_model_checkpoint": "./learnhub-checkpoints/checkpoint-800",
5
- "epoch": 0.6349836293283064,
6
  "eval_steps": 100,
7
- "global_step": 800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.5423119881749152,
14
  "epoch": 0.019843238416509576,
15
- "grad_norm": 2.125,
16
- "learning_rate": 4.8e-05,
17
- "loss": 2.1809950256347657,
18
- "mean_token_accuracy": 0.5977656890451908,
19
- "num_tokens": 42575.0,
20
  "step": 25
21
  },
22
  {
23
- "entropy": 1.1193951192498206,
24
  "epoch": 0.03968647683301915,
25
- "grad_norm": 1.671875,
26
- "learning_rate": 9.8e-05,
27
- "loss": 1.1683277893066406,
28
- "mean_token_accuracy": 0.7463910706341267,
29
- "num_tokens": 85122.0,
30
  "step": 50
31
  },
32
  {
33
- "entropy": 0.9514059691131115,
34
  "epoch": 0.05952971524952872,
35
- "grad_norm": 1.6328125,
36
- "learning_rate": 0.000148,
37
- "loss": 0.9270614624023438,
38
- "mean_token_accuracy": 0.7785248437523842,
39
- "num_tokens": 131421.0,
40
  "step": 75
41
  },
42
  {
43
- "entropy": 0.9425666096806526,
44
  "epoch": 0.0793729536660383,
45
- "grad_norm": 0.84765625,
46
- "learning_rate": 0.00019800000000000002,
47
- "loss": 0.9538876342773438,
48
- "mean_token_accuracy": 0.7819291327893734,
49
- "num_tokens": 176055.0,
50
  "step": 100
51
  },
52
  {
53
  "epoch": 0.0793729536660383,
54
- "eval_entropy": 0.9276722411988145,
55
- "eval_loss": 0.8972787261009216,
56
- "eval_mean_token_accuracy": 0.7821191083464469,
57
- "eval_num_tokens": 176055.0,
58
- "eval_runtime": 290.3105,
59
- "eval_samples_per_second": 3.655,
60
- "eval_steps_per_second": 1.829,
61
  "step": 100
62
- },
63
- {
64
- "entropy": 0.8993638175725936,
65
- "epoch": 0.09921619208254787,
66
- "grad_norm": 1.453125,
67
- "learning_rate": 0.00019997901149992398,
68
- "loss": 0.9062361145019531,
69
- "mean_token_accuracy": 0.7870219559967517,
70
- "num_tokens": 218572.0,
71
- "step": 125
72
- },
73
- {
74
- "entropy": 0.8412108091264963,
75
- "epoch": 0.11905943049905744,
76
- "grad_norm": 1.2578125,
77
- "learning_rate": 0.00019991252117460662,
78
- "loss": 0.8404283905029297,
79
- "mean_token_accuracy": 0.7994147537648678,
80
- "num_tokens": 264473.0,
81
- "step": 150
82
- },
83
- {
84
- "entropy": 0.8592079882323742,
85
- "epoch": 0.13890266891556702,
86
- "grad_norm": 1.34375,
87
- "learning_rate": 0.00019980052286930407,
88
- "loss": 0.8845314025878906,
89
- "mean_token_accuracy": 0.7938638082146645,
90
- "num_tokens": 308986.0,
91
- "step": 175
92
- },
93
- {
94
- "entropy": 0.8827662563323975,
95
- "epoch": 0.1587459073320766,
96
- "grad_norm": 1.4453125,
97
- "learning_rate": 0.000199643067596808,
98
- "loss": 0.9247197723388672,
99
- "mean_token_accuracy": 0.7900393509864807,
100
- "num_tokens": 353705.0,
101
- "step": 200
102
- },
103
- {
104
- "epoch": 0.1587459073320766,
105
- "eval_entropy": 0.8343640095545523,
106
- "eval_loss": 0.815721869468689,
107
- "eval_mean_token_accuracy": 0.7949616669935021,
108
- "eval_num_tokens": 353705.0,
109
- "eval_runtime": 290.3166,
110
- "eval_samples_per_second": 3.655,
111
- "eval_steps_per_second": 1.829,
112
- "step": 200
113
- },
114
- {
115
- "entropy": 0.8292603352665902,
116
- "epoch": 0.17858914574858617,
117
- "grad_norm": 1.3828125,
118
- "learning_rate": 0.00019944022707456992,
119
- "loss": 0.8381348419189453,
120
- "mean_token_accuracy": 0.8005190336704254,
121
- "num_tokens": 396464.0,
122
- "step": 225
123
- },
124
- {
125
- "entropy": 0.8279347644746303,
126
- "epoch": 0.19843238416509573,
127
- "grad_norm": 1.3671875,
128
- "learning_rate": 0.00019919209369203533,
129
- "loss": 0.8449878692626953,
130
- "mean_token_accuracy": 0.7975908493995667,
131
- "num_tokens": 443396.0,
132
- "step": 250
133
- },
134
- {
135
- "entropy": 0.847797994017601,
136
- "epoch": 0.21827562258160532,
137
- "grad_norm": 1.40625,
138
- "learning_rate": 0.0001988987804685623,
139
- "loss": 0.7986299896240234,
140
- "mean_token_accuracy": 0.7965799477696419,
141
- "num_tokens": 488198.0,
142
- "step": 275
143
- },
144
- {
145
- "entropy": 0.8216452768445015,
146
- "epoch": 0.23811886099811488,
147
- "grad_norm": 1.109375,
148
- "learning_rate": 0.00019856042100194356,
149
- "loss": 0.785276870727539,
150
- "mean_token_accuracy": 0.8020041480660438,
151
- "num_tokens": 535664.0,
152
- "step": 300
153
- },
154
- {
155
- "epoch": 0.23811886099811488,
156
- "eval_entropy": 0.8421865575254076,
157
- "eval_loss": 0.7751030921936035,
158
- "eval_mean_token_accuracy": 0.8036603897304858,
159
- "eval_num_tokens": 535664.0,
160
- "eval_runtime": 291.1861,
161
- "eval_samples_per_second": 3.644,
162
- "eval_steps_per_second": 1.824,
163
- "step": 300
164
- },
165
- {
166
- "entropy": 0.7873362612724304,
167
- "epoch": 0.25796209941462445,
168
- "grad_norm": 1.1640625,
169
- "learning_rate": 0.00019817716940755586,
170
- "loss": 0.7424690246582031,
171
- "mean_token_accuracy": 0.8078745475411415,
172
- "num_tokens": 581174.0,
173
- "step": 325
174
- },
175
- {
176
- "entropy": 0.7965878197550773,
177
- "epoch": 0.27780533783113404,
178
- "grad_norm": 1.46875,
179
- "learning_rate": 0.00019774920024816353,
180
- "loss": 0.7590773773193359,
181
- "mean_token_accuracy": 0.8090151616930962,
182
- "num_tokens": 623047.0,
183
- "step": 350
184
- },
185
- {
186
- "entropy": 0.7574200442433358,
187
- "epoch": 0.2976485762476436,
188
- "grad_norm": 1.3515625,
189
- "learning_rate": 0.00019727670845440893,
190
- "loss": 0.7333896636962891,
191
- "mean_token_accuracy": 0.8140154486894607,
192
- "num_tokens": 665332.0,
193
- "step": 375
194
- },
195
- {
196
- "entropy": 0.7607150036841631,
197
- "epoch": 0.3174918146641532,
198
- "grad_norm": 1.3671875,
199
- "learning_rate": 0.00019675990923602598,
200
- "loss": 0.7356826019287109,
201
- "mean_token_accuracy": 0.8127754744887352,
202
- "num_tokens": 709558.0,
203
- "step": 400
204
- },
205
- {
206
- "epoch": 0.3174918146641532,
207
- "eval_entropy": 0.7901154234865067,
208
- "eval_loss": 0.7394365072250366,
209
- "eval_mean_token_accuracy": 0.8106958746012097,
210
- "eval_num_tokens": 709558.0,
211
- "eval_runtime": 290.9403,
212
- "eval_samples_per_second": 3.647,
213
- "eval_steps_per_second": 1.825,
214
- "step": 400
215
- },
216
- {
217
- "entropy": 0.7346216081827879,
218
- "epoch": 0.33733505308066275,
219
- "grad_norm": 1.46875,
220
- "learning_rate": 0.0001961990379838167,
221
- "loss": 0.6551874542236328,
222
- "mean_token_accuracy": 0.8211761102080345,
223
- "num_tokens": 755393.0,
224
- "step": 425
225
- },
226
- {
227
- "entropy": 0.7677556264400482,
228
- "epoch": 0.35717829149717234,
229
- "grad_norm": 1.7421875,
230
- "learning_rate": 0.0001955943501624357,
231
- "loss": 0.7343796539306641,
232
- "mean_token_accuracy": 0.812620207965374,
233
- "num_tokens": 800020.0,
234
- "step": 450
235
- },
236
- {
237
- "entropy": 0.7691158069670201,
238
- "epoch": 0.37702152991368193,
239
- "grad_norm": 1.2734375,
240
- "learning_rate": 0.00019494612119403177,
241
- "loss": 0.7420792388916015,
242
- "mean_token_accuracy": 0.8088723468780518,
243
- "num_tokens": 842459.0,
244
- "step": 475
245
- },
246
- {
247
- "entropy": 0.7609671781212092,
248
- "epoch": 0.39686476833019146,
249
- "grad_norm": 1.28125,
250
- "learning_rate": 0.00019425464633279906,
251
- "loss": 0.733401870727539,
252
- "mean_token_accuracy": 0.8129055750370026,
253
- "num_tokens": 884620.0,
254
- "step": 500
255
- },
256
- {
257
- "epoch": 0.39686476833019146,
258
- "eval_entropy": 0.7718784739938609,
259
- "eval_loss": 0.7210129499435425,
260
- "eval_mean_token_accuracy": 0.8150942843063628,
261
- "eval_num_tokens": 884620.0,
262
- "eval_runtime": 290.0297,
263
- "eval_samples_per_second": 3.658,
264
- "eval_steps_per_second": 1.831,
265
- "step": 500
266
- },
267
- {
268
- "entropy": 0.7566675854474306,
269
- "epoch": 0.41670800674670105,
270
- "grad_norm": 1.359375,
271
- "learning_rate": 0.0001935202405304951,
272
- "loss": 0.7321968078613281,
273
- "mean_token_accuracy": 0.8134541392326355,
274
- "num_tokens": 929259.0,
275
- "step": 525
276
- },
277
- {
278
- "entropy": 0.7571731075644493,
279
- "epoch": 0.43655124516321064,
280
- "grad_norm": 1.640625,
281
- "learning_rate": 0.0001927432382929872,
282
- "loss": 0.7315776824951172,
283
- "mean_token_accuracy": 0.816252943277359,
284
- "num_tokens": 974531.0,
285
- "step": 550
286
- },
287
- {
288
- "entropy": 0.7301681135594845,
289
- "epoch": 0.45639448357972023,
290
- "grad_norm": 10.0625,
291
- "learning_rate": 0.00019192399352789232,
292
- "loss": 0.7157851409912109,
293
- "mean_token_accuracy": 0.8203792923688888,
294
- "num_tokens": 1019376.0,
295
- "step": 575
296
- },
297
- {
298
- "entropy": 0.7234990952163934,
299
- "epoch": 0.47623772199622977,
300
- "grad_norm": 1.5703125,
301
- "learning_rate": 0.00019106287938337984,
302
- "loss": 0.6780443572998047,
303
- "mean_token_accuracy": 0.818044265806675,
304
- "num_tokens": 1065482.0,
305
- "step": 600
306
- },
307
- {
308
- "epoch": 0.47623772199622977,
309
- "eval_entropy": 0.7563102317170683,
310
- "eval_loss": 0.7063755989074707,
311
- "eval_mean_token_accuracy": 0.8184236252150742,
312
- "eval_num_tokens": 1065482.0,
313
- "eval_runtime": 292.1942,
314
- "eval_samples_per_second": 3.631,
315
- "eval_steps_per_second": 1.817,
316
- "step": 600
317
- },
318
- {
319
- "entropy": 0.7022078443691134,
320
- "epoch": 0.49608096041273936,
321
- "grad_norm": 1.375,
322
- "learning_rate": 0.00019016028807821065,
323
- "loss": 0.65089599609375,
324
- "mean_token_accuracy": 0.8246574628353119,
325
- "num_tokens": 1113578.0,
326
- "step": 625
327
- },
328
- {
329
- "entropy": 0.7365065434575081,
330
- "epoch": 0.5159241988292489,
331
- "grad_norm": 1.2578125,
332
- "learning_rate": 0.00018921663072309007,
333
- "loss": 0.7072123718261719,
334
- "mean_token_accuracy": 0.8180231443047523,
335
- "num_tokens": 1158454.0,
336
- "step": 650
337
- },
338
- {
339
- "entropy": 0.6983346920460463,
340
- "epoch": 0.5357674372457585,
341
- "grad_norm": 1.0625,
342
- "learning_rate": 0.0001882323371334159,
343
- "loss": 0.6436396789550781,
344
- "mean_token_accuracy": 0.8268389776349068,
345
- "num_tokens": 1206571.0,
346
- "step": 675
347
- },
348
- {
349
- "entropy": 0.7212439847737551,
350
- "epoch": 0.5556106756622681,
351
- "grad_norm": 1.1796875,
352
- "learning_rate": 0.00018720785563350667,
353
- "loss": 0.6770188903808594,
354
- "mean_token_accuracy": 0.8182832631468773,
355
- "num_tokens": 1249976.0,
356
- "step": 700
357
- },
358
- {
359
- "epoch": 0.5556106756622681,
360
- "eval_entropy": 0.7183699383910767,
361
- "eval_loss": 0.6925566792488098,
362
- "eval_mean_token_accuracy": 0.8202201184130883,
363
- "eval_num_tokens": 1249976.0,
364
- "eval_runtime": 291.4532,
365
- "eval_samples_per_second": 3.64,
366
- "eval_steps_per_second": 1.822,
367
- "step": 700
368
- },
369
- {
370
- "entropy": 0.7322280709445477,
371
- "epoch": 0.5754539140787777,
372
- "grad_norm": 1.65625,
373
- "learning_rate": 0.0001861436528524,
374
- "loss": 0.7028071594238281,
375
- "mean_token_accuracy": 0.8174135899543762,
376
- "num_tokens": 1295378.0,
377
- "step": 725
378
- },
379
- {
380
- "entropy": 0.7073389308154583,
381
- "epoch": 0.5952971524952873,
382
- "grad_norm": 1.40625,
383
- "learning_rate": 0.00018504021351131296,
384
- "loss": 0.684170913696289,
385
- "mean_token_accuracy": 0.8211757111549377,
386
- "num_tokens": 1338843.0,
387
- "step": 750
388
- },
389
- {
390
- "entropy": 0.705704356059432,
391
- "epoch": 0.6151403909117968,
392
- "grad_norm": 1.4375,
393
- "learning_rate": 0.00018389804020286206,
394
- "loss": 0.6670877838134766,
395
- "mean_token_accuracy": 0.8235558214783668,
396
- "num_tokens": 1384327.0,
397
- "step": 775
398
- },
399
- {
400
- "entropy": 0.6878117263317108,
401
- "epoch": 0.6349836293283064,
402
- "grad_norm": 1.53125,
403
- "learning_rate": 0.00018271765316214356,
404
- "loss": 0.6584226989746094,
405
- "mean_token_accuracy": 0.8270227089524269,
406
- "num_tokens": 1427944.0,
407
- "step": 800
408
- },
409
- {
410
- "epoch": 0.6349836293283064,
411
- "eval_entropy": 0.6733546478712401,
412
- "eval_loss": 0.6767381429672241,
413
- "eval_mean_token_accuracy": 0.8240371050134216,
414
- "eval_num_tokens": 1427944.0,
415
- "eval_runtime": 290.3688,
416
- "eval_samples_per_second": 3.654,
417
- "eval_steps_per_second": 1.829,
418
- "step": 800
419
  }
420
  ],
421
  "logging_steps": 25,
422
- "max_steps": 3780,
423
  "num_input_tokens_seen": 0,
424
  "num_train_epochs": 3,
425
  "save_steps": 100,
@@ -435,7 +62,7 @@
435
  "attributes": {}
436
  }
437
  },
438
- "total_flos": 7.968632183621222e+16,
439
  "train_batch_size": 2,
440
  "trial_name": null,
441
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.8977340459823608,
3
+ "best_model_checkpoint": "./learnhub-checkpoints/checkpoint-100",
4
+ "epoch": 0.0793729536660383,
 
5
  "eval_steps": 100,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
 
12
  "epoch": 0.019843238416509576,
13
+ "grad_norm": 18.390625,
14
+ "learning_rate": 5e-05,
15
+ "loss": 16.5658,
 
 
16
  "step": 25
17
  },
18
  {
 
19
  "epoch": 0.03968647683301915,
20
+ "grad_norm": 15.34375,
21
+ "learning_rate": 0.0001,
22
+ "loss": 9.0596,
 
 
23
  "step": 50
24
  },
25
  {
 
26
  "epoch": 0.05952971524952872,
27
+ "grad_norm": 12.78125,
28
+ "learning_rate": 0.00015000000000000001,
29
+ "loss": 7.4307,
 
 
30
  "step": 75
31
  },
32
  {
 
33
  "epoch": 0.0793729536660383,
34
+ "grad_norm": 9.1796875,
35
+ "learning_rate": 0.0002,
36
+ "loss": 7.2264,
 
 
37
  "step": 100
38
  },
39
  {
40
  "epoch": 0.0793729536660383,
41
+ "eval_loss": 0.8977340459823608,
42
+ "eval_runtime": 789.8478,
43
+ "eval_samples_per_second": 1.343,
44
+ "eval_steps_per_second": 0.672,
 
 
 
45
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  }
47
  ],
48
  "logging_steps": 25,
49
+ "max_steps": 3777,
50
  "num_input_tokens_seen": 0,
51
  "num_train_epochs": 3,
52
  "save_steps": 100,
 
62
  "attributes": {}
63
  }
64
  },
65
+ "total_flos": 9601168072753152.0,
66
  "train_batch_size": 2,
67
  "trial_name": null,
68
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7636bfd15768feeae5b8b0ea6c6ea570e3bbf66f8df8000ffdf82d9d44941a2
3
- size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8306ac097284337a989527ee29aa84391a90dbcd0632818db692700704590264
3
+ size 5969