jeromeramos commited on
Commit
a99133b
·
verified ·
1 Parent(s): 27e0090

Model save

Browse files
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jerome-ramos-20/huggingface/runs/rdaw49f9)
31
 
32
 
33
  This model was trained with SFT.
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jerome-ramos-20/huggingface/runs/qm0bt6vo)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -5,10 +5,10 @@
5
  "eval_samples": 2071,
6
  "eval_samples_per_second": 86.635,
7
  "eval_steps_per_second": 2.725,
8
- "total_flos": 1.74045731487744e+18,
9
- "train_loss": 0.8234024724801822,
10
- "train_runtime": 2385.6161,
11
  "train_samples": 46269,
12
- "train_samples_per_second": 19.395,
13
- "train_steps_per_second": 0.151
14
  }
 
5
  "eval_samples": 2071,
6
  "eval_samples_per_second": 86.635,
7
  "eval_steps_per_second": 2.725,
8
+ "total_flos": 1.7115790489220547e+18,
9
+ "train_loss": 0.861595592175164,
10
+ "train_runtime": 2353.9448,
11
  "train_samples": 46269,
12
+ "train_samples_per_second": 19.656,
13
+ "train_steps_per_second": 0.153
14
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96a3f25cdf50508cedb9141a645a1b95248e26d20ef5fe3d2de30857075f9ee2
3
  size 4977222960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f3646a85996025cdaed773e7201ee3e3320349d66731b2b77492ae1a5d14add
3
  size 4977222960
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:675a20a1c8cb7ef8957fa7e3549f80d43b42e7bb023aa7c1a6c3b159e495bc67
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f2743e048959efde7f2379dd20a4fe9079ab98f6b125b32edd2f0c912d96d3e
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc06566be8c2f403026d162424f82153270a6a0d04b0b40e6e14ad4c2ea5332c
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca317cebfba4e27a3e92f9eb2fd21f5695e0e5a514d71d78f5f2e240dd728ae2
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2986a57de29725fc6544bc073fd543bfb7c0fe517bc6ef006cfebc4c12bbb8e5
3
  size 1168663096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e888911d0c982b08e2ac6b084e7de4a4d500111b730a8ef9c8c43b1c4e83ad2
3
  size 1168663096
runs/Feb03_13-28-29_w-jerom-inter-play-sim-94c6890b9ccf44ea86f033a3db8a5dbd-84rg227/events.out.tfevents.1738589574.w-jerom-inter-play-sim-94c6890b9ccf44ea86f033a3db8a5dbd-84rg227.97164.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12bf47e94d4936149802c609d62553a93fc9ac6a80eaa55ea0dd4c4a86390310
3
+ size 18230
runs/Feb03_19-31-47_w-jerom-inter-play-sim-94c6890b9ccf44ea86f033a3db8a5dbd-6ckztwz/events.out.tfevents.1738611496.w-jerom-inter-play-sim-94c6890b9ccf44ea86f033a3db8a5dbd-6ckztwz.6260.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:598f7bfb6190b6462598690862205d1d99f7df666c9d5b8a3087f43e28244b69
3
+ size 21809
special_tokens_map.json CHANGED
@@ -1,4 +1,41 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": {
3
  "content": "<|im_start|>",
4
  "lstrip": false,
 
1
  {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<response>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</response>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<answer>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "</answer>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<inquire>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ }
38
+ ],
39
  "bos_token": {
40
  "content": "<|im_start|>",
41
  "lstrip": false,
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:635e16753749bb3465bdf9e00f68e8b29c9e4884d9ee55eb27705bd8f1318cf4
3
- size 17210395
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3919c1e7bfa558ff525a618a3d463929a238acaba668d7ef6da432fcd6cd7fad
3
+ size 17211327
tokenizer_config.json CHANGED
@@ -2063,8 +2063,55 @@
2063
  "rstrip": false,
2064
  "single_word": false,
2065
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2066
  }
2067
  },
 
 
 
 
 
 
 
2068
  "bos_token": "<|im_start|>",
2069
  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
2070
  "clean_up_tokenization_spaces": true,
 
2063
  "rstrip": false,
2064
  "single_word": false,
2065
  "special": true
2066
+ },
2067
+ "128258": {
2068
+ "content": "<response>",
2069
+ "lstrip": false,
2070
+ "normalized": false,
2071
+ "rstrip": false,
2072
+ "single_word": false,
2073
+ "special": true
2074
+ },
2075
+ "128259": {
2076
+ "content": "</response>",
2077
+ "lstrip": false,
2078
+ "normalized": false,
2079
+ "rstrip": false,
2080
+ "single_word": false,
2081
+ "special": true
2082
+ },
2083
+ "128260": {
2084
+ "content": "<answer>",
2085
+ "lstrip": false,
2086
+ "normalized": false,
2087
+ "rstrip": false,
2088
+ "single_word": false,
2089
+ "special": true
2090
+ },
2091
+ "128261": {
2092
+ "content": "</answer>",
2093
+ "lstrip": false,
2094
+ "normalized": false,
2095
+ "rstrip": false,
2096
+ "single_word": false,
2097
+ "special": true
2098
+ },
2099
+ "128262": {
2100
+ "content": "<inquire>",
2101
+ "lstrip": false,
2102
+ "normalized": false,
2103
+ "rstrip": false,
2104
+ "single_word": false,
2105
+ "special": true
2106
  }
2107
  },
2108
+ "additional_special_tokens": [
2109
+ "<response>",
2110
+ "</response>",
2111
+ "<answer>",
2112
+ "</answer>",
2113
+ "<inquire>"
2114
+ ],
2115
  "bos_token": "<|im_start|>",
2116
  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
2117
  "clean_up_tokenization_spaces": true,
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9986168741355463,
3
- "total_flos": 1.74045731487744e+18,
4
- "train_loss": 0.8234024724801822,
5
- "train_runtime": 2385.6161,
6
  "train_samples": 46269,
7
- "train_samples_per_second": 19.395,
8
- "train_steps_per_second": 0.151
9
  }
 
1
  {
2
  "epoch": 0.9986168741355463,
3
+ "total_flos": 1.7115790489220547e+18,
4
+ "train_loss": 0.861595592175164,
5
+ "train_runtime": 2353.9448,
6
  "train_samples": 46269,
7
+ "train_samples_per_second": 19.656,
8
+ "train_steps_per_second": 0.153
9
  }
trainer_state.json CHANGED
@@ -10,531 +10,531 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0027662517289073307,
13
- "grad_norm": 22.881450653076172,
14
  "learning_rate": 5.405405405405406e-06,
15
- "loss": 1.6158,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.013831258644536652,
20
- "grad_norm": 2.178889274597168,
21
  "learning_rate": 2.702702702702703e-05,
22
- "loss": 1.3807,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.027662517289073305,
27
- "grad_norm": 14.400589942932129,
28
  "learning_rate": 5.405405405405406e-05,
29
- "loss": 1.3352,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.04149377593360996,
34
- "grad_norm": 2.756945848464966,
35
  "learning_rate": 8.108108108108109e-05,
36
- "loss": 1.2203,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.05532503457814661,
41
- "grad_norm": 1.3922957181930542,
42
  "learning_rate": 0.00010810810810810812,
43
- "loss": 1.0964,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.06915629322268327,
48
- "grad_norm": 1.0261996984481812,
49
  "learning_rate": 0.00013513513513513514,
50
- "loss": 1.2033,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.08298755186721991,
55
- "grad_norm": 1.6099579334259033,
56
  "learning_rate": 0.00016216216216216218,
57
- "loss": 1.2005,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.09681881051175657,
62
- "grad_norm": 1.77192223072052,
63
  "learning_rate": 0.0001891891891891892,
64
- "loss": 1.4161,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.11065006915629322,
69
- "grad_norm": 1.0772837400436401,
70
  "learning_rate": 0.0001999576950082201,
71
- "loss": 1.4553,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.12448132780082988,
76
- "grad_norm": 1.4605121612548828,
77
  "learning_rate": 0.0001996992941167792,
78
- "loss": 1.2175,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.13831258644536654,
83
- "grad_norm": 1.0822768211364746,
84
  "learning_rate": 0.00019920660160815422,
85
- "loss": 1.0378,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.15214384508990317,
90
- "grad_norm": 0.9796843528747559,
91
  "learning_rate": 0.00019848077530122083,
92
- "loss": 1.0451,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.16597510373443983,
97
- "grad_norm": 1.1945514678955078,
98
  "learning_rate": 0.00019752352087524933,
99
- "loss": 1.4266,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.1798063623789765,
104
- "grad_norm": 0.8683685064315796,
105
  "learning_rate": 0.00019633708786158806,
106
- "loss": 1.0347,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.19363762102351315,
111
- "grad_norm": 0.25568732619285583,
112
  "learning_rate": 0.0001949242643573034,
113
- "loss": 0.9376,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.2074688796680498,
118
- "grad_norm": 0.26001420617103577,
119
  "learning_rate": 0.0001932883704732001,
120
- "loss": 0.9132,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.22130013831258644,
125
- "grad_norm": 0.2598419189453125,
126
  "learning_rate": 0.00019143325053161796,
127
- "loss": 0.8958,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.2351313969571231,
132
- "grad_norm": 0.20231448113918304,
133
  "learning_rate": 0.00018936326403234125,
134
- "loss": 0.8734,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.24896265560165975,
139
- "grad_norm": 0.17383822798728943,
140
  "learning_rate": 0.00018708327540784922,
141
- "loss": 0.8701,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.2627939142461964,
146
- "grad_norm": 0.17745399475097656,
147
  "learning_rate": 0.0001845986425919841,
148
- "loss": 0.8499,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.2766251728907331,
153
- "grad_norm": 0.17801660299301147,
154
  "learning_rate": 0.0001819152044288992,
155
- "loss": 0.8512,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.29045643153526973,
160
- "grad_norm": 0.18566825985908508,
161
  "learning_rate": 0.00017903926695187595,
162
- "loss": 0.8361,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.30428769017980634,
167
- "grad_norm": 0.18012060225009918,
168
  "learning_rate": 0.00017597758856425494,
169
- "loss": 0.834,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.318118948824343,
174
- "grad_norm": 0.16151954233646393,
175
  "learning_rate": 0.00017273736415730488,
176
- "loss": 0.8114,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.33195020746887965,
181
- "grad_norm": 0.16563855111598969,
182
  "learning_rate": 0.00016932620820235244,
183
- "loss": 0.8191,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.3457814661134163,
188
- "grad_norm": 0.16186057031154633,
189
  "learning_rate": 0.0001657521368569064,
190
- "loss": 0.7887,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.359612724757953,
195
- "grad_norm": 0.1734704077243805,
196
  "learning_rate": 0.000162023549126826,
197
- "loss": 0.7946,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.37344398340248963,
202
- "grad_norm": 0.17336814105510712,
203
  "learning_rate": 0.00015814920712880267,
204
- "loss": 0.7974,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.3872752420470263,
209
- "grad_norm": 0.15509486198425293,
210
  "learning_rate": 0.00015413821549953698,
211
- "loss": 0.7866,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.40110650069156295,
216
- "grad_norm": 0.18101590871810913,
217
  "learning_rate": 0.00015000000000000001,
218
- "loss": 0.7927,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.4149377593360996,
223
- "grad_norm": 0.14941518008708954,
224
  "learning_rate": 0.0001457442853650581,
225
- "loss": 0.7698,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.4287690179806362,
230
- "grad_norm": 0.15677104890346527,
231
  "learning_rate": 0.00014138107245051392,
232
- "loss": 0.7721,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.4426002766251729,
237
- "grad_norm": 0.14607611298561096,
238
  "learning_rate": 0.00013692061473126845,
239
- "loss": 0.7516,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.45643153526970953,
244
- "grad_norm": 0.16472630202770233,
245
  "learning_rate": 0.00013237339420583212,
246
- "loss": 0.7554,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.4702627939142462,
251
- "grad_norm": 0.13666489720344543,
252
  "learning_rate": 0.00012775009676380957,
253
- "loss": 0.7515,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.48409405255878285,
258
- "grad_norm": 0.1362183392047882,
259
  "learning_rate": 0.00012306158707424403,
260
- "loss": 0.7513,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.4979253112033195,
265
- "grad_norm": 0.12810291349887848,
266
  "learning_rate": 0.00011831888305383268,
267
- "loss": 0.7385,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.5117565698478561,
272
- "grad_norm": 0.14311975240707397,
273
  "learning_rate": 0.00011353312997501313,
274
- "loss": 0.7469,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.5255878284923928,
279
- "grad_norm": 0.129547581076622,
280
  "learning_rate": 0.00010871557427476583,
281
- "loss": 0.7423,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.5394190871369294,
286
- "grad_norm": 0.1447523832321167,
287
  "learning_rate": 0.0001038775371256817,
288
- "loss": 0.7351,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.5532503457814661,
293
- "grad_norm": 0.1369813233613968,
294
  "learning_rate": 9.903038783140216e-05,
295
- "loss": 0.7202,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.5670816044260027,
300
- "grad_norm": 0.12533989548683167,
301
  "learning_rate": 9.418551710895243e-05,
302
- "loss": 0.722,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.5809128630705395,
307
- "grad_norm": 0.12739399075508118,
308
  "learning_rate": 8.935431032075318e-05,
309
- "loss": 0.7173,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.5947441217150761,
314
- "grad_norm": 0.13596710562705994,
315
  "learning_rate": 8.454812071921596e-05,
316
- "loss": 0.7194,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.6085753803596127,
321
- "grad_norm": 0.12327581644058228,
322
  "learning_rate": 7.977824276679623e-05,
323
- "loss": 0.7095,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.6224066390041494,
328
- "grad_norm": 0.1317676603794098,
329
  "learning_rate": 7.505588559420189e-05,
330
- "loss": 0.713,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.636237897648686,
335
- "grad_norm": 0.13516183197498322,
336
  "learning_rate": 7.039214665913003e-05,
337
- "loss": 0.7048,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.6500691562932227,
342
- "grad_norm": 0.12717784941196442,
343
  "learning_rate": 6.579798566743314e-05,
344
- "loss": 0.7088,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.6639004149377593,
349
- "grad_norm": 0.12097220122814178,
350
  "learning_rate": 6.128419881799996e-05,
351
- "loss": 0.6939,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.677731673582296,
356
- "grad_norm": 0.1216357946395874,
357
  "learning_rate": 5.6861393431874675e-05,
358
- "loss": 0.6943,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.6915629322268326,
363
- "grad_norm": 0.12578962743282318,
364
  "learning_rate": 5.253996302523596e-05,
365
- "loss": 0.6832,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.7053941908713693,
370
- "grad_norm": 0.1288958042860031,
371
  "learning_rate": 4.833006288481371e-05,
372
- "loss": 0.6786,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.719225449515906,
377
- "grad_norm": 0.13444924354553223,
378
  "learning_rate": 4.424158620314073e-05,
379
- "loss": 0.6861,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.7330567081604425,
384
- "grad_norm": 0.15658161044120789,
385
  "learning_rate": 4.028414082972141e-05,
386
- "loss": 0.6829,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.7468879668049793,
391
- "grad_norm": 0.13638462126255035,
392
  "learning_rate": 3.646702669275151e-05,
393
- "loss": 0.6811,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.7607192254495159,
398
- "grad_norm": 0.11960398405790329,
399
  "learning_rate": 3.279921394444776e-05,
400
- "loss": 0.6645,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.7745504840940526,
405
- "grad_norm": 0.12005037814378738,
406
  "learning_rate": 2.9289321881345254e-05,
407
- "loss": 0.6709,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.7883817427385892,
412
- "grad_norm": 0.12300828844308853,
413
  "learning_rate": 2.594559868909956e-05,
414
- "loss": 0.6629,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.8022130013831259,
419
- "grad_norm": 0.11922738701105118,
420
  "learning_rate": 2.2775902059393085e-05,
421
- "loss": 0.6613,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.8160442600276625,
426
- "grad_norm": 0.11143971979618073,
427
  "learning_rate": 1.9787680724495617e-05,
428
- "loss": 0.6546,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.8298755186721992,
433
- "grad_norm": 0.11601640284061432,
434
  "learning_rate": 1.698795695287212e-05,
435
- "loss": 0.6567,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.8437067773167358,
440
- "grad_norm": 0.11989685148000717,
441
  "learning_rate": 1.4383310046973365e-05,
442
- "loss": 0.657,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.8575380359612724,
447
- "grad_norm": 0.11077902466058731,
448
  "learning_rate": 1.1979860881988902e-05,
449
- "loss": 0.6555,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.8713692946058091,
454
- "grad_norm": 0.11324643343687057,
455
  "learning_rate": 9.783257521896227e-06,
456
- "loss": 0.6468,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.8852005532503457,
461
- "grad_norm": 0.11370333284139633,
462
  "learning_rate": 7.798661946608166e-06,
463
- "loss": 0.648,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.8990318118948825,
468
- "grad_norm": 0.10991474986076355,
469
  "learning_rate": 6.030737921409169e-06,
470
- "loss": 0.6446,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.9128630705394191,
475
- "grad_norm": 0.11461606621742249,
476
  "learning_rate": 4.4836400371876974e-06,
477
- "loss": 0.6387,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.9266943291839558,
482
- "grad_norm": 0.1137213185429573,
483
  "learning_rate": 3.161003947219421e-06,
484
- "loss": 0.6329,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.9405255878284924,
489
- "grad_norm": 0.10857342928647995,
490
  "learning_rate": 2.0659378234448525e-06,
491
- "loss": 0.6627,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.9543568464730291,
496
- "grad_norm": 0.10978103429079056,
497
  "learning_rate": 1.201015052319099e-06,
498
- "loss": 0.6435,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.9681881051175657,
503
- "grad_norm": 0.1058996319770813,
504
  "learning_rate": 5.682681873981577e-07,
505
- "loss": 0.6388,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.9820193637621023,
510
- "grad_norm": 0.10548459738492966,
511
  "learning_rate": 1.6918417287318245e-07,
512
- "loss": 0.6382,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.995850622406639,
517
- "grad_norm": 0.11099706590175629,
518
  "learning_rate": 4.700849277383679e-09,
519
- "loss": 0.6424,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.9986168741355463,
524
- "eval_loss": 0.658014178276062,
525
- "eval_runtime": 53.9504,
526
- "eval_samples_per_second": 85.449,
527
- "eval_steps_per_second": 2.688,
528
  "step": 361
529
  },
530
  {
531
  "epoch": 0.9986168741355463,
532
  "step": 361,
533
- "total_flos": 1.74045731487744e+18,
534
- "train_loss": 0.8234024724801822,
535
- "train_runtime": 2385.6161,
536
- "train_samples_per_second": 19.395,
537
- "train_steps_per_second": 0.151
538
  }
539
  ],
540
  "logging_steps": 5,
@@ -554,7 +554,7 @@
554
  "attributes": {}
555
  }
556
  },
557
- "total_flos": 1.74045731487744e+18,
558
  "train_batch_size": 4,
559
  "trial_name": null,
560
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0027662517289073307,
13
+ "grad_norm": 22.1127872467041,
14
  "learning_rate": 5.405405405405406e-06,
15
+ "loss": 2.6011,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.013831258644536652,
20
+ "grad_norm": 3.526327610015869,
21
  "learning_rate": 2.702702702702703e-05,
22
+ "loss": 2.2001,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.027662517289073305,
27
+ "grad_norm": 2.0694353580474854,
28
  "learning_rate": 5.405405405405406e-05,
29
+ "loss": 1.8786,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.04149377593360996,
34
+ "grad_norm": 1.429513931274414,
35
  "learning_rate": 8.108108108108109e-05,
36
+ "loss": 1.6509,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.05532503457814661,
41
+ "grad_norm": 3.6957762241363525,
42
  "learning_rate": 0.00010810810810810812,
43
+ "loss": 1.4395,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.06915629322268327,
48
+ "grad_norm": 3.5924487113952637,
49
  "learning_rate": 0.00013513513513513514,
50
+ "loss": 1.1714,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.08298755186721991,
55
+ "grad_norm": 1.092515468597412,
56
  "learning_rate": 0.00016216216216216218,
57
+ "loss": 1.2197,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.09681881051175657,
62
+ "grad_norm": 4.442113876342773,
63
  "learning_rate": 0.0001891891891891892,
64
+ "loss": 1.204,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.11065006915629322,
69
+ "grad_norm": 6.686959266662598,
70
  "learning_rate": 0.0001999576950082201,
71
+ "loss": 1.6501,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.12448132780082988,
76
+ "grad_norm": 4.45343017578125,
77
  "learning_rate": 0.0001996992941167792,
78
+ "loss": 1.4183,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.13831258644536654,
83
+ "grad_norm": 5.694210052490234,
84
  "learning_rate": 0.00019920660160815422,
85
+ "loss": 1.5559,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.15214384508990317,
90
+ "grad_norm": 2.5626814365386963,
91
  "learning_rate": 0.00019848077530122083,
92
+ "loss": 1.2451,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.16597510373443983,
97
+ "grad_norm": 0.5388926863670349,
98
  "learning_rate": 0.00019752352087524933,
99
+ "loss": 1.0484,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.1798063623789765,
104
+ "grad_norm": 0.3036655783653259,
105
  "learning_rate": 0.00019633708786158806,
106
+ "loss": 0.9605,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.19363762102351315,
111
+ "grad_norm": 0.25516265630722046,
112
  "learning_rate": 0.0001949242643573034,
113
+ "loss": 0.9121,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.2074688796680498,
118
+ "grad_norm": 0.22290439903736115,
119
  "learning_rate": 0.0001932883704732001,
120
+ "loss": 0.9072,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.22130013831258644,
125
+ "grad_norm": 0.23729199171066284,
126
  "learning_rate": 0.00019143325053161796,
127
+ "loss": 0.8938,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.2351313969571231,
132
+ "grad_norm": 0.2355058640241623,
133
  "learning_rate": 0.00018936326403234125,
134
+ "loss": 0.8759,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.24896265560165975,
139
+ "grad_norm": 0.20683090388774872,
140
  "learning_rate": 0.00018708327540784922,
141
+ "loss": 0.8758,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.2627939142461964,
146
+ "grad_norm": 0.21719329059123993,
147
  "learning_rate": 0.0001845986425919841,
148
+ "loss": 0.8571,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.2766251728907331,
153
+ "grad_norm": 0.20208917558193207,
154
  "learning_rate": 0.0001819152044288992,
155
+ "loss": 0.859,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.29045643153526973,
160
+ "grad_norm": 0.18826699256896973,
161
  "learning_rate": 0.00017903926695187595,
162
+ "loss": 0.8427,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.30428769017980634,
167
+ "grad_norm": 0.18175852298736572,
168
  "learning_rate": 0.00017597758856425494,
169
+ "loss": 0.8389,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.318118948824343,
174
+ "grad_norm": 0.17405715584754944,
175
  "learning_rate": 0.00017273736415730488,
176
+ "loss": 0.8185,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.33195020746887965,
181
+ "grad_norm": 0.15530933439731598,
182
  "learning_rate": 0.00016932620820235244,
183
+ "loss": 0.8249,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.3457814661134163,
188
+ "grad_norm": 0.17757271230220795,
189
  "learning_rate": 0.0001657521368569064,
190
+ "loss": 0.7947,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.359612724757953,
195
+ "grad_norm": 0.18264907598495483,
196
  "learning_rate": 0.000162023549126826,
197
+ "loss": 0.8021,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.37344398340248963,
202
+ "grad_norm": 0.18304209411144257,
203
  "learning_rate": 0.00015814920712880267,
204
+ "loss": 0.8039,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.3872752420470263,
209
+ "grad_norm": 0.16061393916606903,
210
  "learning_rate": 0.00015413821549953698,
211
+ "loss": 0.792,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.40110650069156295,
216
+ "grad_norm": 0.1555311381816864,
217
  "learning_rate": 0.00015000000000000001,
218
+ "loss": 0.7948,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.4149377593360996,
223
+ "grad_norm": 0.15761056542396545,
224
  "learning_rate": 0.0001457442853650581,
225
+ "loss": 0.7768,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.4287690179806362,
230
+ "grad_norm": 0.1716078668832779,
231
  "learning_rate": 0.00014138107245051392,
232
+ "loss": 0.7758,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.4426002766251729,
237
+ "grad_norm": 0.1470308154821396,
238
  "learning_rate": 0.00013692061473126845,
239
+ "loss": 0.7578,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.45643153526970953,
244
+ "grad_norm": 0.15690156817436218,
245
  "learning_rate": 0.00013237339420583212,
246
+ "loss": 0.7619,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.4702627939142462,
251
+ "grad_norm": 0.17660725116729736,
252
  "learning_rate": 0.00012775009676380957,
253
+ "loss": 0.7567,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.48409405255878285,
258
+ "grad_norm": 0.13694822788238525,
259
  "learning_rate": 0.00012306158707424403,
260
+ "loss": 0.7569,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.4979253112033195,
265
+ "grad_norm": 0.12447214871644974,
266
  "learning_rate": 0.00011831888305383268,
267
+ "loss": 0.7414,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.5117565698478561,
272
+ "grad_norm": 0.13208778202533722,
273
  "learning_rate": 0.00011353312997501313,
274
+ "loss": 0.7495,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.5255878284923928,
279
+ "grad_norm": 0.13374905288219452,
280
  "learning_rate": 0.00010871557427476583,
281
+ "loss": 0.7467,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.5394190871369294,
286
+ "grad_norm": 0.14392955601215363,
287
  "learning_rate": 0.0001038775371256817,
288
+ "loss": 0.7388,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.5532503457814661,
293
+ "grad_norm": 0.13033545017242432,
294
  "learning_rate": 9.903038783140216e-05,
295
+ "loss": 0.7239,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.5670816044260027,
300
+ "grad_norm": 0.12652400135993958,
301
  "learning_rate": 9.418551710895243e-05,
302
+ "loss": 0.7251,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.5809128630705395,
307
+ "grad_norm": 0.12813538312911987,
308
  "learning_rate": 8.935431032075318e-05,
309
+ "loss": 0.7206,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.5947441217150761,
314
+ "grad_norm": 0.13136501610279083,
315
  "learning_rate": 8.454812071921596e-05,
316
+ "loss": 0.721,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.6085753803596127,
321
+ "grad_norm": 0.13638000190258026,
322
  "learning_rate": 7.977824276679623e-05,
323
+ "loss": 0.7134,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.6224066390041494,
328
+ "grad_norm": 0.13380198180675507,
329
  "learning_rate": 7.505588559420189e-05,
330
+ "loss": 0.7158,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.636237897648686,
335
+ "grad_norm": 0.13291427493095398,
336
  "learning_rate": 7.039214665913003e-05,
337
+ "loss": 0.7068,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.6500691562932227,
342
+ "grad_norm": 0.12505605816841125,
343
  "learning_rate": 6.579798566743314e-05,
344
+ "loss": 0.7109,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.6639004149377593,
349
+ "grad_norm": 0.11483744531869888,
350
  "learning_rate": 6.128419881799996e-05,
351
+ "loss": 0.6962,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.677731673582296,
356
+ "grad_norm": 0.1254301220178604,
357
  "learning_rate": 5.6861393431874675e-05,
358
+ "loss": 0.6944,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.6915629322268326,
363
+ "grad_norm": 0.13567984104156494,
364
  "learning_rate": 5.253996302523596e-05,
365
+ "loss": 0.6865,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.7053941908713693,
370
+ "grad_norm": 0.1235489696264267,
371
  "learning_rate": 4.833006288481371e-05,
372
+ "loss": 0.6807,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.719225449515906,
377
+ "grad_norm": 0.13388977944850922,
378
  "learning_rate": 4.424158620314073e-05,
379
+ "loss": 0.6881,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.7330567081604425,
384
+ "grad_norm": 0.12815245985984802,
385
  "learning_rate": 4.028414082972141e-05,
386
+ "loss": 0.6842,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.7468879668049793,
391
+ "grad_norm": 0.1258043646812439,
392
  "learning_rate": 3.646702669275151e-05,
393
+ "loss": 0.6832,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.7607192254495159,
398
+ "grad_norm": 0.11947453022003174,
399
  "learning_rate": 3.279921394444776e-05,
400
+ "loss": 0.6672,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.7745504840940526,
405
+ "grad_norm": 0.12488783895969391,
406
  "learning_rate": 2.9289321881345254e-05,
407
+ "loss": 0.6729,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.7883817427385892,
412
+ "grad_norm": 0.11996188759803772,
413
  "learning_rate": 2.594559868909956e-05,
414
+ "loss": 0.6641,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.8022130013831259,
419
+ "grad_norm": 0.12338840216398239,
420
  "learning_rate": 2.2775902059393085e-05,
421
+ "loss": 0.6618,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.8160442600276625,
426
+ "grad_norm": 0.11500907689332962,
427
  "learning_rate": 1.9787680724495617e-05,
428
+ "loss": 0.6576,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.8298755186721992,
433
+ "grad_norm": 0.1203397586941719,
434
  "learning_rate": 1.698795695287212e-05,
435
+ "loss": 0.6579,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.8437067773167358,
440
+ "grad_norm": 0.11593286693096161,
441
  "learning_rate": 1.4383310046973365e-05,
442
+ "loss": 0.659,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.8575380359612724,
447
+ "grad_norm": 0.10674016922712326,
448
  "learning_rate": 1.1979860881988902e-05,
449
+ "loss": 0.6581,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.8713692946058091,
454
+ "grad_norm": 0.1114317774772644,
455
  "learning_rate": 9.783257521896227e-06,
456
+ "loss": 0.6489,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.8852005532503457,
461
+ "grad_norm": 0.11088614910840988,
462
  "learning_rate": 7.798661946608166e-06,
463
+ "loss": 0.6485,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.8990318118948825,
468
+ "grad_norm": 0.10715563595294952,
469
  "learning_rate": 6.030737921409169e-06,
470
+ "loss": 0.645,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.9128630705394191,
475
+ "grad_norm": 0.11442163586616516,
476
  "learning_rate": 4.4836400371876974e-06,
477
+ "loss": 0.64,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.9266943291839558,
482
+ "grad_norm": 0.1089484840631485,
483
  "learning_rate": 3.161003947219421e-06,
484
+ "loss": 0.6336,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.9405255878284924,
489
+ "grad_norm": 0.10584916174411774,
490
  "learning_rate": 2.0659378234448525e-06,
491
+ "loss": 0.665,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.9543568464730291,
496
+ "grad_norm": 0.10534138232469559,
497
  "learning_rate": 1.201015052319099e-06,
498
+ "loss": 0.6455,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.9681881051175657,
503
+ "grad_norm": 0.1038522943854332,
504
  "learning_rate": 5.682681873981577e-07,
505
+ "loss": 0.6406,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.9820193637621023,
510
+ "grad_norm": 0.10471897572278976,
511
  "learning_rate": 1.6918417287318245e-07,
512
+ "loss": 0.6396,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.995850622406639,
517
+ "grad_norm": 0.10800525546073914,
518
  "learning_rate": 4.700849277383679e-09,
519
+ "loss": 0.6434,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.9986168741355463,
524
+ "eval_loss": 0.6599090695381165,
525
+ "eval_runtime": 53.0431,
526
+ "eval_samples_per_second": 86.91,
527
+ "eval_steps_per_second": 2.734,
528
  "step": 361
529
  },
530
  {
531
  "epoch": 0.9986168741355463,
532
  "step": 361,
533
+ "total_flos": 1.7115790489220547e+18,
534
+ "train_loss": 0.861595592175164,
535
+ "train_runtime": 2353.9448,
536
+ "train_samples_per_second": 19.656,
537
+ "train_steps_per_second": 0.153
538
  }
539
  ],
540
  "logging_steps": 5,
 
554
  "attributes": {}
555
  }
556
  },
557
+ "total_flos": 1.7115790489220547e+18,
558
  "train_batch_size": 4,
559
  "trial_name": null,
560
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:475f29db775c3fa3ebae0c3997a227d93f50e4e631d281907a24df8c23250da0
3
  size 7096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4394984455d4ffe3e51e3b2431658cf9b616f4718e0ca4da0047bdbe4ff3859e
3
  size 7096