shorecode commited on
Commit
b1aecd0
·
verified ·
1 Parent(s): ebef2a4

Upload folder using huggingface_hub

Browse files
checkpoint-2000/README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: google/gemma-3-1b-it
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:google/gemma-3-1b-it
7
  - lora
8
  - sft
9
  - transformers
 
1
  ---
2
+ base_model: shorecode/gemma-3-svg-generator-lora-xla
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:shorecode/gemma-3-svg-generator-lora-xla
7
  - lora
8
  - sft
9
  - transformers
checkpoint-2000/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "google/gemma-3-1b-it",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -31,12 +31,12 @@
31
  "rank_pattern": {},
32
  "revision": null,
33
  "target_modules": [
34
- "k_proj",
35
- "down_proj",
36
  "v_proj",
37
- "up_proj",
38
- "o_proj",
39
  "gate_proj",
 
 
40
  "q_proj"
41
  ],
42
  "target_parameters": null,
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "shorecode/gemma-3-svg-generator-lora-xla",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
31
  "rank_pattern": {},
32
  "revision": null,
33
  "target_modules": [
 
 
34
  "v_proj",
35
+ "down_proj",
36
+ "k_proj",
37
  "gate_proj",
38
+ "o_proj",
39
+ "up_proj",
40
  "q_proj"
41
  ],
42
  "target_parameters": null,
checkpoint-2000/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb4c881b22e201369a998a2269f505391c3df3071f452a00372d18ad8b7acfe8
3
- size 4936309960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5812b45db7d0efe0329e95bd318cd43609e615ba7438b10cf201f5ddeed156d
3
+ size 4936273096
checkpoint-2000/added_tokens.json CHANGED
@@ -1,5 +1,3 @@
1
  {
2
- "</think>": 262146,
3
- "<image_soft_token>": 262144,
4
- "<think>": 262145
5
  }
 
1
  {
2
+ "<image_soft_token>": 262144
 
 
3
  }
checkpoint-2000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa2f2822b39ac9940117e52b893f71391ce8be3b5c9193ba268a07ada03a97d1
3
- size 2624975850
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ce2877b6c340c5baf746a08803290fcd1c4c2ca4af95b7a3811f8a0de13b0f6
3
+ size 2624957418
checkpoint-2000/special_tokens_map.json CHANGED
@@ -1,20 +1,4 @@
1
  {
2
- "additional_special_tokens": [
3
- {
4
- "content": "<think>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false
9
- },
10
- {
11
- "content": "</think>",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": false
16
- }
17
- ],
18
  "boi_token": "<start_of_image>",
19
  "bos_token": {
20
  "content": "<bos>",
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "boi_token": "<start_of_image>",
3
  "bos_token": {
4
  "content": "<bos>",
checkpoint-2000/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:48b0c1cd0578eea659e9d9249b1c575599a2a172bfaa9130c92d0e155f3b9fe0
3
- size 33384937
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
checkpoint-2000/tokenizer_config.json CHANGED
@@ -51321,28 +51321,8 @@
51321
  "rstrip": false,
51322
  "single_word": false,
51323
  "special": true
51324
- },
51325
- "262145": {
51326
- "content": "<think>",
51327
- "lstrip": false,
51328
- "normalized": false,
51329
- "rstrip": false,
51330
- "single_word": false,
51331
- "special": true
51332
- },
51333
- "262146": {
51334
- "content": "</think>",
51335
- "lstrip": false,
51336
- "normalized": false,
51337
- "rstrip": false,
51338
- "single_word": false,
51339
- "special": true
51340
  }
51341
  },
51342
- "additional_special_tokens": [
51343
- "<think>",
51344
- "</think>"
51345
- ],
51346
  "boi_token": "<start_of_image>",
51347
  "bos_token": "<bos>",
51348
  "clean_up_tokenization_spaces": false,
 
51321
  "rstrip": false,
51322
  "single_word": false,
51323
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51324
  }
51325
  },
 
 
 
 
51326
  "boi_token": "<start_of_image>",
51327
  "bos_token": "<bos>",
51328
  "clean_up_tokenization_spaces": false,
checkpoint-2000/trainer_state.json CHANGED
@@ -10,423 +10,423 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 0.362657470703125,
14
  "epoch": 0.06148170919151552,
15
- "grad_norm": 0.21484375,
16
  "learning_rate": 0.0002,
17
- "loss": 0.365,
18
- "mean_token_accuracy": 0.920078125,
19
  "num_tokens": 102400.0,
20
  "step": 100
21
  },
22
  {
23
  "epoch": 0.06148170919151552,
24
- "eval_entropy": 0.15455180637085636,
25
- "eval_loss": 0.12699459493160248,
26
- "eval_mean_token_accuracy": 0.9715771754143646,
27
  "eval_num_tokens": 102400.0,
28
- "eval_runtime": 58.9889,
29
- "eval_samples_per_second": 6.137,
30
- "eval_steps_per_second": 6.137,
31
  "step": 100
32
  },
33
  {
34
- "entropy": 0.129871826171875,
35
  "epoch": 0.12296341838303104,
36
- "grad_norm": 0.2158203125,
37
  "learning_rate": 0.0002,
38
- "loss": 0.1138,
39
- "mean_token_accuracy": 0.97357421875,
40
  "num_tokens": 204739.0,
41
  "step": 200
42
  },
43
  {
44
  "epoch": 0.12296341838303104,
45
- "eval_entropy": 0.10937230231353591,
46
- "eval_loss": 0.11088907718658447,
47
- "eval_mean_token_accuracy": 0.9744475138121547,
48
  "eval_num_tokens": 204739.0,
49
- "eval_runtime": 45.6439,
50
- "eval_samples_per_second": 7.931,
51
- "eval_steps_per_second": 7.931,
52
  "step": 200
53
  },
54
  {
55
- "entropy": 0.112332763671875,
56
  "epoch": 0.18444512757454656,
57
- "grad_norm": 0.3515625,
58
  "learning_rate": 0.0002,
59
- "loss": 0.0994,
60
- "mean_token_accuracy": 0.976171875,
61
  "num_tokens": 307139.0,
62
  "step": 300
63
  },
64
  {
65
  "epoch": 0.18444512757454656,
66
- "eval_entropy": 0.11139961369129835,
67
- "eval_loss": 0.10375571995973587,
68
- "eval_mean_token_accuracy": 0.9751057493093923,
69
  "eval_num_tokens": 307139.0,
70
- "eval_runtime": 44.7353,
71
- "eval_samples_per_second": 8.092,
72
- "eval_steps_per_second": 8.092,
73
  "step": 300
74
  },
75
  {
76
- "entropy": 0.113790283203125,
77
  "epoch": 0.24592683676606208,
78
- "grad_norm": 0.47265625,
79
  "learning_rate": 0.0002,
80
- "loss": 0.1013,
81
- "mean_token_accuracy": 0.97640625,
82
  "num_tokens": 409539.0,
83
  "step": 400
84
  },
85
  {
86
  "epoch": 0.24592683676606208,
87
- "eval_entropy": 0.11420487040314227,
88
- "eval_loss": 0.099822998046875,
89
- "eval_mean_token_accuracy": 0.97564528660221,
90
  "eval_num_tokens": 409539.0,
91
- "eval_runtime": 45.4755,
92
- "eval_samples_per_second": 7.96,
93
- "eval_steps_per_second": 7.96,
94
  "step": 400
95
  },
96
  {
97
- "entropy": 0.1103631591796875,
98
  "epoch": 0.3074085459575776,
99
- "grad_norm": 0.203125,
100
  "learning_rate": 0.0002,
101
- "loss": 0.1006,
102
- "mean_token_accuracy": 0.9761328125,
103
  "num_tokens": 511939.0,
104
  "step": 500
105
  },
106
  {
107
  "epoch": 0.3074085459575776,
108
- "eval_entropy": 0.10883040454506215,
109
- "eval_loss": 0.09676331281661987,
110
- "eval_mean_token_accuracy": 0.9761200794198895,
111
  "eval_num_tokens": 511939.0,
112
- "eval_runtime": 45.514,
113
- "eval_samples_per_second": 7.954,
114
- "eval_steps_per_second": 7.954,
115
  "step": 500
116
  },
117
  {
118
- "entropy": 0.100784912109375,
119
  "epoch": 0.3688902551490931,
120
- "grad_norm": 0.1826171875,
121
  "learning_rate": 0.0002,
122
- "loss": 0.0887,
123
- "mean_token_accuracy": 0.9786328125,
124
  "num_tokens": 614339.0,
125
  "step": 600
126
  },
127
  {
128
  "epoch": 0.3688902551490931,
129
- "eval_entropy": 0.1045960484288674,
130
- "eval_loss": 0.09669654816389084,
131
- "eval_mean_token_accuracy": 0.9765409185082873,
132
  "eval_num_tokens": 614339.0,
133
- "eval_runtime": 45.0249,
134
- "eval_samples_per_second": 8.04,
135
- "eval_steps_per_second": 8.04,
136
  "step": 600
137
  },
138
  {
139
- "entropy": 0.1064935302734375,
140
  "epoch": 0.43037196434060865,
141
- "grad_norm": 0.3359375,
142
  "learning_rate": 0.0002,
143
- "loss": 0.0925,
144
- "mean_token_accuracy": 0.9767578125,
145
  "num_tokens": 716739.0,
146
  "step": 700
147
  },
148
  {
149
  "epoch": 0.43037196434060865,
150
- "eval_entropy": 0.09553384517437845,
151
- "eval_loss": 0.09429358690977097,
152
- "eval_mean_token_accuracy": 0.976530127762431,
153
  "eval_num_tokens": 716739.0,
154
- "eval_runtime": 44.4869,
155
- "eval_samples_per_second": 8.137,
156
- "eval_steps_per_second": 8.137,
157
  "step": 700
158
  },
159
  {
160
- "entropy": 0.0933135986328125,
161
  "epoch": 0.49185367353212417,
162
- "grad_norm": 0.28125,
163
  "learning_rate": 0.0002,
164
- "loss": 0.0831,
165
- "mean_token_accuracy": 0.97962890625,
166
  "num_tokens": 819139.0,
167
  "step": 800
168
  },
169
  {
170
  "epoch": 0.49185367353212417,
171
- "eval_entropy": 0.09884896462793508,
172
- "eval_loss": 0.09294946491718292,
173
- "eval_mean_token_accuracy": 0.9765732907458563,
174
  "eval_num_tokens": 819139.0,
175
- "eval_runtime": 44.9598,
176
- "eval_samples_per_second": 8.052,
177
- "eval_steps_per_second": 8.052,
178
  "step": 800
179
  },
180
  {
181
- "entropy": 0.093392333984375,
182
  "epoch": 0.5533353827236397,
183
- "grad_norm": 0.2041015625,
184
  "learning_rate": 0.0002,
185
- "loss": 0.0813,
186
- "mean_token_accuracy": 0.98,
187
  "num_tokens": 921539.0,
188
  "step": 900
189
  },
190
  {
191
  "epoch": 0.5533353827236397,
192
- "eval_entropy": 0.09274511179212708,
193
- "eval_loss": 0.09229105710983276,
194
- "eval_mean_token_accuracy": 0.9766596167127072,
195
  "eval_num_tokens": 921539.0,
196
- "eval_runtime": 45.0568,
197
- "eval_samples_per_second": 8.034,
198
- "eval_steps_per_second": 8.034,
199
  "step": 900
200
  },
201
  {
202
- "entropy": 0.095006103515625,
203
  "epoch": 0.6148170919151552,
204
- "grad_norm": 0.16015625,
205
  "learning_rate": 0.0002,
206
- "loss": 0.0869,
207
- "mean_token_accuracy": 0.97876953125,
208
  "num_tokens": 1023939.0,
209
  "step": 1000
210
  },
211
  {
212
  "epoch": 0.6148170919151552,
213
- "eval_entropy": 0.1029599015883978,
214
- "eval_loss": 0.09008755534887314,
215
- "eval_mean_token_accuracy": 0.9774257596685083,
216
  "eval_num_tokens": 1023939.0,
217
- "eval_runtime": 44.9809,
218
- "eval_samples_per_second": 8.048,
219
- "eval_steps_per_second": 8.048,
220
  "step": 1000
221
  },
222
  {
223
- "entropy": 0.1004425048828125,
224
  "epoch": 0.6762988011066707,
225
- "grad_norm": 0.2353515625,
226
  "learning_rate": 0.0002,
227
- "loss": 0.0875,
228
- "mean_token_accuracy": 0.9771875,
229
  "num_tokens": 1126339.0,
230
  "step": 1100
231
  },
232
  {
233
  "epoch": 0.6762988011066707,
234
- "eval_entropy": 0.09525227414968923,
235
- "eval_loss": 0.09049726277589798,
236
- "eval_mean_token_accuracy": 0.9773178522099447,
237
  "eval_num_tokens": 1126339.0,
238
- "eval_runtime": 45.9151,
239
- "eval_samples_per_second": 7.884,
240
- "eval_steps_per_second": 7.884,
241
  "step": 1100
242
  },
243
  {
244
- "entropy": 0.09235107421875,
245
  "epoch": 0.7377805102981863,
246
- "grad_norm": 0.2255859375,
247
  "learning_rate": 0.0002,
248
- "loss": 0.0831,
249
- "mean_token_accuracy": 0.97900390625,
250
  "num_tokens": 1228739.0,
251
  "step": 1200
252
  },
253
  {
254
  "epoch": 0.7377805102981863,
255
- "eval_entropy": 0.09683514168249309,
256
- "eval_loss": 0.0896323174238205,
257
- "eval_mean_token_accuracy": 0.977263898480663,
258
  "eval_num_tokens": 1228739.0,
259
- "eval_runtime": 44.6006,
260
- "eval_samples_per_second": 8.116,
261
- "eval_steps_per_second": 8.116,
262
  "step": 1200
263
  },
264
  {
265
- "entropy": 0.0924603271484375,
266
  "epoch": 0.7992622194897018,
267
- "grad_norm": 0.2177734375,
268
  "learning_rate": 0.0002,
269
- "loss": 0.0813,
270
- "mean_token_accuracy": 0.97939453125,
271
  "num_tokens": 1331139.0,
272
  "step": 1300
273
  },
274
  {
275
  "epoch": 0.7992622194897018,
276
- "eval_entropy": 0.09229459815262431,
277
- "eval_loss": 0.08921755105257034,
278
- "eval_mean_token_accuracy": 0.9773070614640884,
279
  "eval_num_tokens": 1331139.0,
280
- "eval_runtime": 46.1046,
281
- "eval_samples_per_second": 7.852,
282
- "eval_steps_per_second": 7.852,
283
  "step": 1300
284
  },
285
  {
286
- "entropy": 0.0900238037109375,
287
  "epoch": 0.8607439286812173,
288
- "grad_norm": 0.1416015625,
289
  "learning_rate": 0.0002,
290
- "loss": 0.0819,
291
- "mean_token_accuracy": 0.97998046875,
292
  "num_tokens": 1433475.0,
293
  "step": 1400
294
  },
295
  {
296
  "epoch": 0.8607439286812173,
297
- "eval_entropy": 0.09326273038242404,
298
- "eval_loss": 0.08730876445770264,
299
- "eval_mean_token_accuracy": 0.9778789709944752,
300
  "eval_num_tokens": 1433475.0,
301
- "eval_runtime": 46.2555,
302
- "eval_samples_per_second": 7.826,
303
- "eval_steps_per_second": 7.826,
304
  "step": 1400
305
  },
306
  {
307
- "entropy": 0.08128265380859374,
308
  "epoch": 0.9222256378727328,
309
- "grad_norm": 0.12890625,
310
  "learning_rate": 0.0002,
311
- "loss": 0.0737,
312
- "mean_token_accuracy": 0.98046875,
313
  "num_tokens": 1535875.0,
314
  "step": 1500
315
  },
316
  {
317
  "epoch": 0.9222256378727328,
318
- "eval_entropy": 0.08759505972677832,
319
- "eval_loss": 0.08772876113653183,
320
- "eval_mean_token_accuracy": 0.97747971339779,
321
  "eval_num_tokens": 1535875.0,
322
- "eval_runtime": 46.2271,
323
- "eval_samples_per_second": 7.831,
324
- "eval_steps_per_second": 7.831,
325
  "step": 1500
326
  },
327
  {
328
- "entropy": 0.0855340576171875,
329
  "epoch": 0.9837073470642483,
330
- "grad_norm": 0.2109375,
331
  "learning_rate": 0.0002,
332
- "loss": 0.0797,
333
- "mean_token_accuracy": 0.979765625,
334
  "num_tokens": 1638213.0,
335
  "step": 1600
336
  },
337
  {
338
  "epoch": 0.9837073470642483,
339
- "eval_entropy": 0.10621263704247237,
340
- "eval_loss": 0.08904456347227097,
341
- "eval_mean_token_accuracy": 0.9773178522099447,
342
  "eval_num_tokens": 1638213.0,
343
- "eval_runtime": 46.6372,
344
- "eval_samples_per_second": 7.762,
345
- "eval_steps_per_second": 7.762,
346
  "step": 1600
347
  },
348
  {
349
- "entropy": 0.07732387523555276,
350
  "epoch": 1.0448816477098064,
351
- "grad_norm": 0.205078125,
352
  "learning_rate": 0.0002,
353
- "loss": 0.0663,
354
- "mean_token_accuracy": 0.9825298366834171,
355
  "num_tokens": 1740101.0,
356
  "step": 1700
357
  },
358
  {
359
  "epoch": 1.0448816477098064,
360
- "eval_entropy": 0.08320103429299033,
361
- "eval_loss": 0.08749625831842422,
362
- "eval_mean_token_accuracy": 0.9779437154696132,
363
  "eval_num_tokens": 1740101.0,
364
- "eval_runtime": 46.4346,
365
- "eval_samples_per_second": 7.796,
366
- "eval_steps_per_second": 7.796,
367
  "step": 1700
368
  },
369
  {
370
- "entropy": 0.07719329833984374,
371
  "epoch": 1.1063633569013218,
372
- "grad_norm": 0.1923828125,
373
  "learning_rate": 0.0002,
374
- "loss": 0.0731,
375
- "mean_token_accuracy": 0.9812890625,
376
  "num_tokens": 1842501.0,
377
  "step": 1800
378
  },
379
  {
380
  "epoch": 1.1063633569013218,
381
- "eval_entropy": 0.07808470330844268,
382
- "eval_loss": 0.08772134780883789,
383
- "eval_mean_token_accuracy": 0.9776847375690608,
384
  "eval_num_tokens": 1842501.0,
385
- "eval_runtime": 46.8283,
386
- "eval_samples_per_second": 7.73,
387
- "eval_steps_per_second": 7.73,
388
  "step": 1800
389
  },
390
  {
391
- "entropy": 0.07139556884765624,
392
  "epoch": 1.1678450660928374,
393
- "grad_norm": 0.240234375,
394
  "learning_rate": 0.0002,
395
- "loss": 0.0663,
396
- "mean_token_accuracy": 0.9823046875,
397
  "num_tokens": 1944901.0,
398
  "step": 1900
399
  },
400
  {
401
  "epoch": 1.1678450660928374,
402
- "eval_entropy": 0.08272489263207873,
403
- "eval_loss": 0.08744163066148758,
404
- "eval_mean_token_accuracy": 0.9776739468232044,
405
  "eval_num_tokens": 1944901.0,
406
- "eval_runtime": 46.5748,
407
- "eval_samples_per_second": 7.772,
408
- "eval_steps_per_second": 7.772,
409
  "step": 1900
410
  },
411
  {
412
- "entropy": 0.0790863037109375,
413
  "epoch": 1.2293267752843529,
414
- "grad_norm": 0.302734375,
415
  "learning_rate": 0.0002,
416
- "loss": 0.0681,
417
- "mean_token_accuracy": 0.981484375,
418
  "num_tokens": 2047301.0,
419
  "step": 2000
420
  },
421
  {
422
  "epoch": 1.2293267752843529,
423
- "eval_entropy": 0.08277783472893646,
424
- "eval_loss": 0.08769083023071289,
425
- "eval_mean_token_accuracy": 0.9780732044198895,
426
  "eval_num_tokens": 2047301.0,
427
- "eval_runtime": 46.2216,
428
- "eval_samples_per_second": 7.832,
429
- "eval_steps_per_second": 7.832,
430
  "step": 2000
431
  }
432
  ],
@@ -447,7 +447,7 @@
447
  "attributes": {}
448
  }
449
  },
450
- "total_flos": 1.2602950679251968e+16,
451
  "train_batch_size": 1,
452
  "trial_name": null,
453
  "trial_params": null
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 0.06958419799804688,
14
  "epoch": 0.06148170919151552,
15
+ "grad_norm": 0.12353515625,
16
  "learning_rate": 0.0002,
17
+ "loss": 0.0684,
18
+ "mean_token_accuracy": 0.98111328125,
19
  "num_tokens": 102400.0,
20
  "step": 100
21
  },
22
  {
23
  "epoch": 0.06148170919151552,
24
+ "eval_entropy": 0.06598355625215815,
25
+ "eval_loss": 0.09076400101184845,
26
+ "eval_mean_token_accuracy": 0.9780192506906077,
27
  "eval_num_tokens": 102400.0,
28
+ "eval_runtime": 58.8573,
29
+ "eval_samples_per_second": 6.15,
30
+ "eval_steps_per_second": 6.15,
31
  "step": 100
32
  },
33
  {
34
+ "entropy": 0.05923065185546875,
35
  "epoch": 0.12296341838303104,
36
+ "grad_norm": 0.140625,
37
  "learning_rate": 0.0002,
38
+ "loss": 0.0597,
39
+ "mean_token_accuracy": 0.9837109375,
40
  "num_tokens": 204739.0,
41
  "step": 200
42
  },
43
  {
44
  "epoch": 0.12296341838303104,
45
+ "eval_entropy": 0.0683226190219268,
46
+ "eval_loss": 0.0902482345700264,
47
+ "eval_mean_token_accuracy": 0.9778681802486188,
48
  "eval_num_tokens": 204739.0,
49
+ "eval_runtime": 45.3855,
50
+ "eval_samples_per_second": 7.976,
51
+ "eval_steps_per_second": 7.976,
52
  "step": 200
53
  },
54
  {
55
+ "entropy": 0.0568572998046875,
56
  "epoch": 0.18444512757454656,
57
+ "grad_norm": 0.1689453125,
58
  "learning_rate": 0.0002,
59
+ "loss": 0.0531,
60
+ "mean_token_accuracy": 0.98462890625,
61
  "num_tokens": 307139.0,
62
  "step": 300
63
  },
64
  {
65
  "epoch": 0.18444512757454656,
66
+ "eval_entropy": 0.06631065073592887,
67
+ "eval_loss": 0.09246724843978882,
68
+ "eval_mean_token_accuracy": 0.9779005524861878,
69
  "eval_num_tokens": 307139.0,
70
+ "eval_runtime": 45.6788,
71
+ "eval_samples_per_second": 7.925,
72
+ "eval_steps_per_second": 7.925,
73
  "step": 300
74
  },
75
  {
76
+ "entropy": 0.06324066162109375,
77
  "epoch": 0.24592683676606208,
78
+ "grad_norm": 0.21875,
79
  "learning_rate": 0.0002,
80
+ "loss": 0.0581,
81
+ "mean_token_accuracy": 0.98373046875,
82
  "num_tokens": 409539.0,
83
  "step": 400
84
  },
85
  {
86
  "epoch": 0.24592683676606208,
87
+ "eval_entropy": 0.07637689917127072,
88
+ "eval_loss": 0.08831820636987686,
89
+ "eval_mean_token_accuracy": 0.9779868784530387,
90
  "eval_num_tokens": 409539.0,
91
+ "eval_runtime": 45.2583,
92
+ "eval_samples_per_second": 7.999,
93
+ "eval_steps_per_second": 7.999,
94
  "step": 400
95
  },
96
  {
97
+ "entropy": 0.06784759521484375,
98
  "epoch": 0.3074085459575776,
99
+ "grad_norm": 0.1826171875,
100
  "learning_rate": 0.0002,
101
+ "loss": 0.0584,
102
+ "mean_token_accuracy": 0.98369140625,
103
  "num_tokens": 511939.0,
104
  "step": 500
105
  },
106
  {
107
  "epoch": 0.3074085459575776,
108
+ "eval_entropy": 0.07653134172133978,
109
+ "eval_loss": 0.08888135105371475,
110
+ "eval_mean_token_accuracy": 0.9780300414364641,
111
  "eval_num_tokens": 511939.0,
112
+ "eval_runtime": 45.6823,
113
+ "eval_samples_per_second": 7.924,
114
+ "eval_steps_per_second": 7.924,
115
  "step": 500
116
  },
117
  {
118
+ "entropy": 0.065264892578125,
119
  "epoch": 0.3688902551490931,
120
+ "grad_norm": 0.193359375,
121
  "learning_rate": 0.0002,
122
+ "loss": 0.0516,
123
+ "mean_token_accuracy": 0.98486328125,
124
  "num_tokens": 614339.0,
125
  "step": 600
126
  },
127
  {
128
  "epoch": 0.3688902551490931,
129
+ "eval_entropy": 0.07568089606353591,
130
+ "eval_loss": 0.09014976769685745,
131
+ "eval_mean_token_accuracy": 0.9778897617403315,
132
  "eval_num_tokens": 614339.0,
133
+ "eval_runtime": 45.9066,
134
+ "eval_samples_per_second": 7.886,
135
+ "eval_steps_per_second": 7.886,
136
  "step": 600
137
  },
138
  {
139
+ "entropy": 0.0712152099609375,
140
  "epoch": 0.43037196434060865,
141
+ "grad_norm": 0.2421875,
142
  "learning_rate": 0.0002,
143
+ "loss": 0.0584,
144
+ "mean_token_accuracy": 0.983046875,
145
  "num_tokens": 716739.0,
146
  "step": 700
147
  },
148
  {
149
  "epoch": 0.43037196434060865,
150
+ "eval_entropy": 0.07403969106094613,
151
+ "eval_loss": 0.09142003953456879,
152
+ "eval_mean_token_accuracy": 0.977965296961326,
153
  "eval_num_tokens": 716739.0,
154
+ "eval_runtime": 45.4178,
155
+ "eval_samples_per_second": 7.97,
156
+ "eval_steps_per_second": 7.97,
157
  "step": 700
158
  },
159
  {
160
+ "entropy": 0.0648382568359375,
161
  "epoch": 0.49185367353212417,
162
+ "grad_norm": 0.162109375,
163
  "learning_rate": 0.0002,
164
+ "loss": 0.0512,
165
+ "mean_token_accuracy": 0.9851953125,
166
  "num_tokens": 819139.0,
167
  "step": 800
168
  },
169
  {
170
  "epoch": 0.49185367353212417,
171
+ "eval_entropy": 0.07564245403142265,
172
+ "eval_loss": 0.08956116437911987,
173
+ "eval_mean_token_accuracy": 0.977846598756906,
174
  "eval_num_tokens": 819139.0,
175
+ "eval_runtime": 45.1314,
176
+ "eval_samples_per_second": 8.021,
177
+ "eval_steps_per_second": 8.021,
178
  "step": 800
179
  },
180
  {
181
+ "entropy": 0.0652081298828125,
182
  "epoch": 0.5533353827236397,
183
+ "grad_norm": 0.1474609375,
184
  "learning_rate": 0.0002,
185
+ "loss": 0.0503,
186
+ "mean_token_accuracy": 0.98572265625,
187
  "num_tokens": 921539.0,
188
  "step": 900
189
  },
190
  {
191
  "epoch": 0.5533353827236397,
192
+ "eval_entropy": 0.07643860874913674,
193
+ "eval_loss": 0.09081660211086273,
194
+ "eval_mean_token_accuracy": 0.977781854281768,
195
  "eval_num_tokens": 921539.0,
196
+ "eval_runtime": 45.2589,
197
+ "eval_samples_per_second": 7.998,
198
+ "eval_steps_per_second": 7.998,
199
  "step": 900
200
  },
201
  {
202
+ "entropy": 0.0689361572265625,
203
  "epoch": 0.6148170919151552,
204
+ "grad_norm": 0.1279296875,
205
  "learning_rate": 0.0002,
206
+ "loss": 0.0547,
207
+ "mean_token_accuracy": 0.98447265625,
208
  "num_tokens": 1023939.0,
209
  "step": 1000
210
  },
211
  {
212
  "epoch": 0.6148170919151552,
213
+ "eval_entropy": 0.07796246439053868,
214
+ "eval_loss": 0.09036962687969208,
215
+ "eval_mean_token_accuracy": 0.9782674378453039,
216
  "eval_num_tokens": 1023939.0,
217
+ "eval_runtime": 45.7559,
218
+ "eval_samples_per_second": 7.912,
219
+ "eval_steps_per_second": 7.912,
220
  "step": 1000
221
  },
222
  {
223
+ "entropy": 0.07148681640625,
224
  "epoch": 0.6762988011066707,
225
+ "grad_norm": 0.06884765625,
226
  "learning_rate": 0.0002,
227
+ "loss": 0.0556,
228
+ "mean_token_accuracy": 0.98369140625,
229
  "num_tokens": 1126339.0,
230
  "step": 1100
231
  },
232
  {
233
  "epoch": 0.6762988011066707,
234
+ "eval_entropy": 0.07977396085117404,
235
+ "eval_loss": 0.0895102471113205,
236
+ "eval_mean_token_accuracy": 0.9782026933701657,
237
  "eval_num_tokens": 1126339.0,
238
+ "eval_runtime": 45.3177,
239
+ "eval_samples_per_second": 7.988,
240
+ "eval_steps_per_second": 7.988,
241
  "step": 1100
242
  },
243
  {
244
+ "entropy": 0.0679693603515625,
245
  "epoch": 0.7377805102981863,
246
+ "grad_norm": 0.08837890625,
247
  "learning_rate": 0.0002,
248
+ "loss": 0.0534,
249
+ "mean_token_accuracy": 0.98482421875,
250
  "num_tokens": 1228739.0,
251
  "step": 1200
252
  },
253
  {
254
  "epoch": 0.7377805102981863,
255
+ "eval_entropy": 0.07723847278573895,
256
+ "eval_loss": 0.09167025238275528,
257
+ "eval_mean_token_accuracy": 0.9781595303867403,
258
  "eval_num_tokens": 1228739.0,
259
+ "eval_runtime": 45.0604,
260
+ "eval_samples_per_second": 8.034,
261
+ "eval_steps_per_second": 8.034,
262
  "step": 1200
263
  },
264
  {
265
+ "entropy": 0.0679254150390625,
266
  "epoch": 0.7992622194897018,
267
+ "grad_norm": 0.2158203125,
268
  "learning_rate": 0.0002,
269
+ "loss": 0.0537,
270
+ "mean_token_accuracy": 0.98462890625,
271
  "num_tokens": 1331139.0,
272
  "step": 1300
273
  },
274
  {
275
  "epoch": 0.7992622194897018,
276
+ "eval_entropy": 0.07550386038933012,
277
+ "eval_loss": 0.09077057242393494,
278
+ "eval_mean_token_accuracy": 0.9782026933701657,
279
  "eval_num_tokens": 1331139.0,
280
+ "eval_runtime": 45.1835,
281
+ "eval_samples_per_second": 8.012,
282
+ "eval_steps_per_second": 8.012,
283
  "step": 1300
284
  },
285
  {
286
+ "entropy": 0.0667388916015625,
287
  "epoch": 0.8607439286812173,
288
+ "grad_norm": 0.166015625,
289
  "learning_rate": 0.0002,
290
+ "loss": 0.0497,
291
+ "mean_token_accuracy": 0.98517578125,
292
  "num_tokens": 1433475.0,
293
  "step": 1400
294
  },
295
  {
296
  "epoch": 0.8607439286812173,
297
+ "eval_entropy": 0.07443018117662292,
298
+ "eval_loss": 0.0910056084394455,
299
+ "eval_mean_token_accuracy": 0.9782026933701657,
300
  "eval_num_tokens": 1433475.0,
301
+ "eval_runtime": 46.1219,
302
+ "eval_samples_per_second": 7.849,
303
+ "eval_steps_per_second": 7.849,
304
  "step": 1400
305
  },
306
  {
307
+ "entropy": 0.065689697265625,
308
  "epoch": 0.9222256378727328,
309
+ "grad_norm": 0.130859375,
310
  "learning_rate": 0.0002,
311
+ "loss": 0.0491,
312
+ "mean_token_accuracy": 0.985703125,
313
  "num_tokens": 1535875.0,
314
  "step": 1500
315
  },
316
  {
317
  "epoch": 0.9222256378727328,
318
+ "eval_entropy": 0.07454584448377072,
319
+ "eval_loss": 0.09148883074522018,
320
+ "eval_mean_token_accuracy": 0.9779976691988951,
321
  "eval_num_tokens": 1535875.0,
322
+ "eval_runtime": 45.4996,
323
+ "eval_samples_per_second": 7.956,
324
+ "eval_steps_per_second": 7.956,
325
  "step": 1500
326
  },
327
  {
328
+ "entropy": 0.06522216796875,
329
  "epoch": 0.9837073470642483,
330
+ "grad_norm": 0.234375,
331
  "learning_rate": 0.0002,
332
+ "loss": 0.0503,
333
+ "mean_token_accuracy": 0.98533203125,
334
  "num_tokens": 1638213.0,
335
  "step": 1600
336
  },
337
  {
338
  "epoch": 0.9837073470642483,
339
+ "eval_entropy": 0.08157905030645718,
340
+ "eval_loss": 0.08965592086315155,
341
+ "eval_mean_token_accuracy": 0.9780300414364641,
342
  "eval_num_tokens": 1638213.0,
343
+ "eval_runtime": 46.8838,
344
+ "eval_samples_per_second": 7.721,
345
+ "eval_steps_per_second": 7.721,
346
  "step": 1600
347
  },
348
  {
349
+ "entropy": 0.0635839107647613,
350
  "epoch": 1.0448816477098064,
351
+ "grad_norm": 0.166015625,
352
  "learning_rate": 0.0002,
353
+ "loss": 0.045,
354
+ "mean_token_accuracy": 0.9866323806532663,
355
  "num_tokens": 1740101.0,
356
  "step": 1700
357
  },
358
  {
359
  "epoch": 1.0448816477098064,
360
+ "eval_entropy": 0.07249357950621547,
361
+ "eval_loss": 0.09257431328296661,
362
+ "eval_mean_token_accuracy": 0.9786019509668509,
363
  "eval_num_tokens": 1740101.0,
364
+ "eval_runtime": 46.53,
365
+ "eval_samples_per_second": 7.78,
366
+ "eval_steps_per_second": 7.78,
367
  "step": 1700
368
  },
369
  {
370
+ "entropy": 0.0614752197265625,
371
  "epoch": 1.1063633569013218,
372
+ "grad_norm": 0.23828125,
373
  "learning_rate": 0.0002,
374
+ "loss": 0.0472,
375
+ "mean_token_accuracy": 0.98642578125,
376
  "num_tokens": 1842501.0,
377
  "step": 1800
378
  },
379
  {
380
  "epoch": 1.1063633569013218,
381
+ "eval_entropy": 0.06955445010359115,
382
+ "eval_loss": 0.09410356730222702,
383
+ "eval_mean_token_accuracy": 0.9780732044198895,
384
  "eval_num_tokens": 1842501.0,
385
+ "eval_runtime": 45.9671,
386
+ "eval_samples_per_second": 7.875,
387
+ "eval_steps_per_second": 7.875,
388
  "step": 1800
389
  },
390
  {
391
+ "entropy": 0.059293212890625,
392
  "epoch": 1.1678450660928374,
393
+ "grad_norm": 0.357421875,
394
  "learning_rate": 0.0002,
395
+ "loss": 0.0437,
396
+ "mean_token_accuracy": 0.986953125,
397
  "num_tokens": 1944901.0,
398
  "step": 1900
399
  },
400
  {
401
  "epoch": 1.1678450660928374,
402
+ "eval_entropy": 0.0688324817636395,
403
+ "eval_loss": 0.09622900187969208,
404
+ "eval_mean_token_accuracy": 0.9781379488950276,
405
  "eval_num_tokens": 1944901.0,
406
+ "eval_runtime": 46.9487,
407
+ "eval_samples_per_second": 7.711,
408
+ "eval_steps_per_second": 7.711,
409
  "step": 1900
410
  },
411
  {
412
+ "entropy": 0.063258056640625,
413
  "epoch": 1.2293267752843529,
414
+ "grad_norm": 0.34375,
415
  "learning_rate": 0.0002,
416
+ "loss": 0.0472,
417
+ "mean_token_accuracy": 0.9858984375,
418
  "num_tokens": 2047301.0,
419
  "step": 2000
420
  },
421
  {
422
  "epoch": 1.2293267752843529,
423
+ "eval_entropy": 0.06529750086325967,
424
+ "eval_loss": 0.09772183746099472,
425
+ "eval_mean_token_accuracy": 0.9782458563535912,
426
  "eval_num_tokens": 2047301.0,
427
+ "eval_runtime": 46.937,
428
+ "eval_samples_per_second": 7.712,
429
+ "eval_steps_per_second": 7.712,
430
  "step": 2000
431
  }
432
  ],
 
447
  "attributes": {}
448
  }
449
  },
450
+ "total_flos": 1.2602922377362944e+16,
451
  "train_batch_size": 1,
452
  "trial_name": null,
453
  "trial_params": null
checkpoint-2000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18c7d98fab55a6e8d1ce829252f9f0e44946e60f7037415c9effaf69f276a316
3
  size 6289
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48b867a353950ebc919ed73d524747453141e001abb6f9f0f72a2a0f45975392
3
  size 6289