youssefedweqd commited on
Commit
90be0f5
·
verified ·
1 Parent(s): 13d8a8a

Training in progress, step 3700

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d62d6ae9aa86bcdcc7d3d242ac450960f9ce497ea0fa9e5ad148c0305577d59
3
  size 161533160
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39bf1781e1025eb888d83521fecc3d9b52fe716fd15beb3542cda76db3dcc4ba
3
  size 161533160
last-checkpoint/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "gate_proj",
27
  "q_proj",
28
  "v_proj",
29
- "o_proj",
30
- "down_proj",
31
  "k_proj",
32
- "up_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
26
  "q_proj",
27
  "v_proj",
 
 
28
  "k_proj",
29
+ "up_proj",
30
+ "down_proj",
31
+ "gate_proj",
32
+ "o_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d62d6ae9aa86bcdcc7d3d242ac450960f9ce497ea0fa9e5ad148c0305577d59
3
  size 161533160
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:027c28cbacad0920c7a8ec1a4dbaf396f0658e37d9c57aa24903513cf568bf29
3
  size 161533160
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7c7116503ae238c561f9beb3e928f6010b861d5a3819401abd2fb144fce9271
3
  size 323292202
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a811f08d635f9fd429d0ac8672eee899607dd871ece10f326b8ec3e7266d9db2
3
  size 323292202
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e277eb5876c65e49563b85b1f801a6e569dd5a5ee1b70cc75c8434fc96347f80
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:824d4a418ca52dbceab02ca3bdda11d00d54b246084fd87a75671a28233a0cb2
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.5960264900662252,
6
  "eval_steps": 100,
7
- "global_step": 3600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -164,370 +164,6 @@
164
  "learning_rate": 5.513245033112583e-05,
165
  "loss": 0.7213,
166
  "step": 1000
167
- },
168
- {
169
- "epoch": 0.173841059602649,
170
- "grad_norm": 1.8289754390716553,
171
- "learning_rate": 5.789183222958058e-05,
172
- "loss": 0.7335,
173
- "step": 1050
174
- },
175
- {
176
- "epoch": 0.18211920529801323,
177
- "grad_norm": 1.4989681243896484,
178
- "learning_rate": 6.065121412803533e-05,
179
- "loss": 0.7326,
180
- "step": 1100
181
- },
182
- {
183
- "epoch": 0.19039735099337748,
184
- "grad_norm": 1.5326098203659058,
185
- "learning_rate": 6.341059602649006e-05,
186
- "loss": 0.7311,
187
- "step": 1150
188
- },
189
- {
190
- "epoch": 0.1986754966887417,
191
- "grad_norm": 1.4897147417068481,
192
- "learning_rate": 6.616997792494481e-05,
193
- "loss": 0.6918,
194
- "step": 1200
195
- },
196
- {
197
- "epoch": 0.20695364238410596,
198
- "grad_norm": 1.634765863418579,
199
- "learning_rate": 6.892935982339957e-05,
200
- "loss": 0.7051,
201
- "step": 1250
202
- },
203
- {
204
- "epoch": 0.2152317880794702,
205
- "grad_norm": 1.4463587999343872,
206
- "learning_rate": 7.168874172185431e-05,
207
- "loss": 0.6955,
208
- "step": 1300
209
- },
210
- {
211
- "epoch": 0.22350993377483444,
212
- "grad_norm": 1.632133960723877,
213
- "learning_rate": 7.444812362030905e-05,
214
- "loss": 0.6901,
215
- "step": 1350
216
- },
217
- {
218
- "epoch": 0.23178807947019867,
219
- "grad_norm": 1.4062328338623047,
220
- "learning_rate": 7.72075055187638e-05,
221
- "loss": 0.6833,
222
- "step": 1400
223
- },
224
- {
225
- "epoch": 0.24006622516556292,
226
- "grad_norm": 1.2914466857910156,
227
- "learning_rate": 7.996688741721855e-05,
228
- "loss": 0.6663,
229
- "step": 1450
230
- },
231
- {
232
- "epoch": 0.24834437086092714,
233
- "grad_norm": 1.4995919466018677,
234
- "learning_rate": 8.272626931567329e-05,
235
- "loss": 0.6959,
236
- "step": 1500
237
- },
238
- {
239
- "epoch": 0.25662251655629137,
240
- "grad_norm": 1.1299749612808228,
241
- "learning_rate": 8.548565121412803e-05,
242
- "loss": 0.6685,
243
- "step": 1550
244
- },
245
- {
246
- "epoch": 0.26490066225165565,
247
- "grad_norm": 1.329004168510437,
248
- "learning_rate": 8.824503311258279e-05,
249
- "loss": 0.6678,
250
- "step": 1600
251
- },
252
- {
253
- "epoch": 0.2731788079470199,
254
- "grad_norm": 1.5191948413848877,
255
- "learning_rate": 9.100441501103754e-05,
256
- "loss": 0.6731,
257
- "step": 1650
258
- },
259
- {
260
- "epoch": 0.2814569536423841,
261
- "grad_norm": 1.739169716835022,
262
- "learning_rate": 9.376379690949227e-05,
263
- "loss": 0.6691,
264
- "step": 1700
265
- },
266
- {
267
- "epoch": 0.2897350993377483,
268
- "grad_norm": 1.2906118631362915,
269
- "learning_rate": 9.652317880794703e-05,
270
- "loss": 0.6718,
271
- "step": 1750
272
- },
273
- {
274
- "epoch": 0.2980132450331126,
275
- "grad_norm": 1.289502501487732,
276
- "learning_rate": 9.928256070640178e-05,
277
- "loss": 0.6581,
278
- "step": 1800
279
- },
280
- {
281
- "epoch": 0.30629139072847683,
282
- "grad_norm": 1.3923128843307495,
283
- "learning_rate": 9.999872989402833e-05,
284
- "loss": 0.6589,
285
- "step": 1850
286
- },
287
- {
288
- "epoch": 0.31456953642384106,
289
- "grad_norm": 1.1048816442489624,
290
- "learning_rate": 9.999297790520483e-05,
291
- "loss": 0.6341,
292
- "step": 1900
293
- },
294
- {
295
- "epoch": 0.3228476821192053,
296
- "grad_norm": 1.3568603992462158,
297
- "learning_rate": 9.998258777484084e-05,
298
- "loss": 0.6318,
299
- "step": 1950
300
- },
301
- {
302
- "epoch": 0.33112582781456956,
303
- "grad_norm": 0.923786997795105,
304
- "learning_rate": 9.996756046688961e-05,
305
- "loss": 0.6318,
306
- "step": 2000
307
- },
308
- {
309
- "epoch": 0.3394039735099338,
310
- "grad_norm": 1.102367877960205,
311
- "learning_rate": 9.994789737552259e-05,
312
- "loss": 0.6193,
313
- "step": 2050
314
- },
315
- {
316
- "epoch": 0.347682119205298,
317
- "grad_norm": 1.0738896131515503,
318
- "learning_rate": 9.992360032500001e-05,
319
- "loss": 0.6184,
320
- "step": 2100
321
- },
322
- {
323
- "epoch": 0.35596026490066224,
324
- "grad_norm": 1.279288649559021,
325
- "learning_rate": 9.98946715695016e-05,
326
- "loss": 0.626,
327
- "step": 2150
328
- },
329
- {
330
- "epoch": 0.36423841059602646,
331
- "grad_norm": 1.2009036540985107,
332
- "learning_rate": 9.986111379291759e-05,
333
- "loss": 0.6305,
334
- "step": 2200
335
- },
336
- {
337
- "epoch": 0.37251655629139074,
338
- "grad_norm": 0.8177038431167603,
339
- "learning_rate": 9.982293010859955e-05,
340
- "loss": 0.6266,
341
- "step": 2250
342
- },
343
- {
344
- "epoch": 0.38079470198675497,
345
- "grad_norm": 1.2464983463287354,
346
- "learning_rate": 9.978012405907165e-05,
347
- "loss": 0.6148,
348
- "step": 2300
349
- },
350
- {
351
- "epoch": 0.3890728476821192,
352
- "grad_norm": 1.2841860055923462,
353
- "learning_rate": 9.973269961570195e-05,
354
- "loss": 0.5946,
355
- "step": 2350
356
- },
357
- {
358
- "epoch": 0.3973509933774834,
359
- "grad_norm": 1.2200431823730469,
360
- "learning_rate": 9.968066117833401e-05,
361
- "loss": 0.6166,
362
- "step": 2400
363
- },
364
- {
365
- "epoch": 0.4056291390728477,
366
- "grad_norm": 1.128247857093811,
367
- "learning_rate": 9.962401357487863e-05,
368
- "loss": 0.5992,
369
- "step": 2450
370
- },
371
- {
372
- "epoch": 0.4139072847682119,
373
- "grad_norm": 1.0683091878890991,
374
- "learning_rate": 9.956276206086597e-05,
375
- "loss": 0.6048,
376
- "step": 2500
377
- },
378
- {
379
- "epoch": 0.42218543046357615,
380
- "grad_norm": 1.1819758415222168,
381
- "learning_rate": 9.949691231895791e-05,
382
- "loss": 0.5944,
383
- "step": 2550
384
- },
385
- {
386
- "epoch": 0.4304635761589404,
387
- "grad_norm": 1.0043411254882812,
388
- "learning_rate": 9.942647045842095e-05,
389
- "loss": 0.5962,
390
- "step": 2600
391
- },
392
- {
393
- "epoch": 0.43874172185430466,
394
- "grad_norm": 1.0588668584823608,
395
- "learning_rate": 9.93514430145593e-05,
396
- "loss": 0.6067,
397
- "step": 2650
398
- },
399
- {
400
- "epoch": 0.4470198675496689,
401
- "grad_norm": 0.9364084601402283,
402
- "learning_rate": 9.927183694810862e-05,
403
- "loss": 0.5928,
404
- "step": 2700
405
- },
406
- {
407
- "epoch": 0.4552980132450331,
408
- "grad_norm": 1.155172348022461,
409
- "learning_rate": 9.918765964459022e-05,
410
- "loss": 0.5987,
411
- "step": 2750
412
- },
413
- {
414
- "epoch": 0.46357615894039733,
415
- "grad_norm": 1.1639224290847778,
416
- "learning_rate": 9.909891891362587e-05,
417
- "loss": 0.5745,
418
- "step": 2800
419
- },
420
- {
421
- "epoch": 0.4718543046357616,
422
- "grad_norm": 0.9658174514770508,
423
- "learning_rate": 9.900562298821323e-05,
424
- "loss": 0.5825,
425
- "step": 2850
426
- },
427
- {
428
- "epoch": 0.48013245033112584,
429
- "grad_norm": 1.118033766746521,
430
- "learning_rate": 9.890778052396205e-05,
431
- "loss": 0.5806,
432
- "step": 2900
433
- },
434
- {
435
- "epoch": 0.48841059602649006,
436
- "grad_norm": 0.9781912565231323,
437
- "learning_rate": 9.880540059829115e-05,
438
- "loss": 0.5712,
439
- "step": 2950
440
- },
441
- {
442
- "epoch": 0.4966887417218543,
443
- "grad_norm": 1.2145684957504272,
444
- "learning_rate": 9.869849270958622e-05,
445
- "loss": 0.5855,
446
- "step": 3000
447
- },
448
- {
449
- "epoch": 0.5049668874172185,
450
- "grad_norm": 0.999279260635376,
451
- "learning_rate": 9.858706677631862e-05,
452
- "loss": 0.5843,
453
- "step": 3050
454
- },
455
- {
456
- "epoch": 0.5132450331125827,
457
- "grad_norm": 1.098258137702942,
458
- "learning_rate": 9.847113313612517e-05,
459
- "loss": 0.5605,
460
- "step": 3100
461
- },
462
- {
463
- "epoch": 0.5215231788079471,
464
- "grad_norm": 0.627949059009552,
465
- "learning_rate": 9.835070254484912e-05,
466
- "loss": 0.5538,
467
- "step": 3150
468
- },
469
- {
470
- "epoch": 0.5298013245033113,
471
- "grad_norm": 1.0991902351379395,
472
- "learning_rate": 9.822578617554219e-05,
473
- "loss": 0.5555,
474
- "step": 3200
475
- },
476
- {
477
- "epoch": 0.5380794701986755,
478
- "grad_norm": 0.9670843482017517,
479
- "learning_rate": 9.8096395617428e-05,
480
- "loss": 0.5647,
481
- "step": 3250
482
- },
483
- {
484
- "epoch": 0.5463576158940397,
485
- "grad_norm": 0.9838133454322815,
486
- "learning_rate": 9.796254287482693e-05,
487
- "loss": 0.5561,
488
- "step": 3300
489
- },
490
- {
491
- "epoch": 0.554635761589404,
492
- "grad_norm": 1.1465744972229004,
493
- "learning_rate": 9.782424036604234e-05,
494
- "loss": 0.559,
495
- "step": 3350
496
- },
497
- {
498
- "epoch": 0.5629139072847682,
499
- "grad_norm": 1.1423758268356323,
500
- "learning_rate": 9.768150092220849e-05,
501
- "loss": 0.5517,
502
- "step": 3400
503
- },
504
- {
505
- "epoch": 0.5711920529801324,
506
- "grad_norm": 1.1365066766738892,
507
- "learning_rate": 9.753433778610008e-05,
508
- "loss": 0.5464,
509
- "step": 3450
510
- },
511
- {
512
- "epoch": 0.5794701986754967,
513
- "grad_norm": 0.81045001745224,
514
- "learning_rate": 9.738276461090371e-05,
515
- "loss": 0.5493,
516
- "step": 3500
517
- },
518
- {
519
- "epoch": 0.5877483443708609,
520
- "grad_norm": 1.0236687660217285,
521
- "learning_rate": 9.72267954589511e-05,
522
- "loss": 0.567,
523
- "step": 3550
524
- },
525
- {
526
- "epoch": 0.5960264900662252,
527
- "grad_norm": 0.9495602250099182,
528
- "learning_rate": 9.706644480041455e-05,
529
- "loss": 0.5474,
530
- "step": 3600
531
  }
532
  ],
533
  "logging_steps": 50,
@@ -547,7 +183,7 @@
547
  "attributes": {}
548
  }
549
  },
550
- "total_flos": 2.821155937006387e+16,
551
  "train_batch_size": 1,
552
  "trial_name": null,
553
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.16556291390728478,
6
  "eval_steps": 100,
7
+ "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
164
  "learning_rate": 5.513245033112583e-05,
165
  "loss": 0.7213,
166
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  }
168
  ],
169
  "logging_steps": 50,
 
183
  "attributes": {}
184
  }
185
  },
186
+ "total_flos": 7833052747137024.0,
187
  "train_batch_size": 1,
188
  "trial_name": null,
189
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2543e07a37d2c3de3cd8e1d682eb10ddfc7a8cf84209a331e0b0e44870af81c3
3
  size 5752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dd2ef96eff028fc6db83c8627ce2e789cafe652a25ea367c040819bc392f916
3
  size 5752
trainer_log.jsonl CHANGED
@@ -75,3 +75,5 @@
75
  {"current_steps": 3500, "total_steps": 18120, "loss": 0.5493, "lr": 9.738276461090371e-05, "epoch": 0.5794701986754967, "percentage": 19.32, "elapsed_time": "2:41:32", "remaining_time": "11:14:47"}
76
  {"current_steps": 3550, "total_steps": 18120, "loss": 0.567, "lr": 9.72267954589511e-05, "epoch": 0.5877483443708609, "percentage": 19.59, "elapsed_time": "2:44:54", "remaining_time": "11:16:50"}
77
  {"current_steps": 3600, "total_steps": 18120, "loss": 0.5474, "lr": 9.706644480041455e-05, "epoch": 0.5960264900662252, "percentage": 19.87, "elapsed_time": "2:48:09", "remaining_time": "11:18:14"}
 
 
 
75
  {"current_steps": 3500, "total_steps": 18120, "loss": 0.5493, "lr": 9.738276461090371e-05, "epoch": 0.5794701986754967, "percentage": 19.32, "elapsed_time": "2:41:32", "remaining_time": "11:14:47"}
76
  {"current_steps": 3550, "total_steps": 18120, "loss": 0.567, "lr": 9.72267954589511e-05, "epoch": 0.5877483443708609, "percentage": 19.59, "elapsed_time": "2:44:54", "remaining_time": "11:16:50"}
77
  {"current_steps": 3600, "total_steps": 18120, "loss": 0.5474, "lr": 9.706644480041455e-05, "epoch": 0.5960264900662252, "percentage": 19.87, "elapsed_time": "2:48:09", "remaining_time": "11:18:14"}
78
+ {"current_steps": 3650, "total_steps": 18120, "loss": 0.5238, "lr": 9.690172751196437e-05, "epoch": 0.6043046357615894, "percentage": 20.14, "elapsed_time": "2:51:29", "remaining_time": "11:19:51"}
79
+ {"current_steps": 3700, "total_steps": 18120, "loss": 0.521, "lr": 9.67326588753887e-05, "epoch": 0.6125827814569537, "percentage": 20.42, "elapsed_time": "2:54:52", "remaining_time": "11:21:30"}