Markus committed on
Commit
4ae434f
·
1 Parent(s): bb6735d

dev v1 run epoch 2

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:921adda863e0351928e7ea50247b82ca865b3621936767ceeb9a80fd357ac393
3
  size 167832688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4cbada59623194f6c69a68d8c36c90528947cf9952ad098a827be4df7f954a9
3
  size 167832688
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0eac29e8b18904353a37433106e598c61b5f7df1d385d625677e3c6d4687005e
3
  size 335928722
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90465c9ea7d82334c5b100de3426a8eb9e8f7c982bcf019dbaefa8bcb8009641
3
  size 335928722
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d65074ec4acc545701030f5ea4ceadb1b1f0dcdfcf93b5c3b3b245a40ec009f6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bed885709ed3a9466ff15e38eedc83597e774b6cceb79f6cd7499cf5ffa6725d
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:89863f3912f019669cfea60f1797f53313f2c8d5bf8721f6e9ee094e6cd17f6d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de5a38ab50c47292a1483af688075150937969dc0e638274974d85780ed606dc
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9985553308292401,
5
  "eval_steps": 500,
6
- "global_step": 432,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -316,6 +316,315 @@
316
  "eval_samples_per_second": 0.984,
317
  "eval_steps_per_second": 0.984,
318
  "step": 432
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  }
320
  ],
321
  "logging_steps": 10,
@@ -323,7 +632,7 @@
323
  "num_input_tokens_seen": 0,
324
  "num_train_epochs": 2,
325
  "save_steps": 500,
326
- "total_flos": 3.059751851843912e+17,
327
  "train_batch_size": 1,
328
  "trial_name": null,
329
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.9971106616584802,
5
  "eval_steps": 500,
6
+ "global_step": 864,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
316
  "eval_samples_per_second": 0.984,
317
  "eval_steps_per_second": 0.984,
318
  "step": 432
319
+ },
320
+ {
321
+ "epoch": 1.02,
322
+ "grad_norm": 0.259765625,
323
+ "learning_rate": 0.00012,
324
+ "loss": 1.5637,
325
+ "step": 440
326
+ },
327
+ {
328
+ "epoch": 1.04,
329
+ "grad_norm": 0.2734375,
330
+ "learning_rate": 0.00012,
331
+ "loss": 1.5142,
332
+ "step": 450
333
+ },
334
+ {
335
+ "epoch": 1.06,
336
+ "grad_norm": 0.294921875,
337
+ "learning_rate": 0.00012,
338
+ "loss": 1.533,
339
+ "step": 460
340
+ },
341
+ {
342
+ "epoch": 1.09,
343
+ "grad_norm": 0.296875,
344
+ "learning_rate": 0.00012,
345
+ "loss": 1.5256,
346
+ "step": 470
347
+ },
348
+ {
349
+ "epoch": 1.11,
350
+ "grad_norm": 0.318359375,
351
+ "learning_rate": 0.00012,
352
+ "loss": 1.5749,
353
+ "step": 480
354
+ },
355
+ {
356
+ "epoch": 1.13,
357
+ "grad_norm": 0.302734375,
358
+ "learning_rate": 0.00012,
359
+ "loss": 1.5535,
360
+ "step": 490
361
+ },
362
+ {
363
+ "epoch": 1.16,
364
+ "grad_norm": 0.3359375,
365
+ "learning_rate": 0.00012,
366
+ "loss": 1.5754,
367
+ "step": 500
368
+ },
369
+ {
370
+ "epoch": 1.18,
371
+ "grad_norm": 0.302734375,
372
+ "learning_rate": 0.00012,
373
+ "loss": 1.52,
374
+ "step": 510
375
+ },
376
+ {
377
+ "epoch": 1.2,
378
+ "grad_norm": 0.31640625,
379
+ "learning_rate": 0.00012,
380
+ "loss": 1.5559,
381
+ "step": 520
382
+ },
383
+ {
384
+ "epoch": 1.23,
385
+ "grad_norm": 0.32421875,
386
+ "learning_rate": 0.00012,
387
+ "loss": 1.5231,
388
+ "step": 530
389
+ },
390
+ {
391
+ "epoch": 1.25,
392
+ "grad_norm": 0.3203125,
393
+ "learning_rate": 0.00012,
394
+ "loss": 1.5671,
395
+ "step": 540
396
+ },
397
+ {
398
+ "epoch": 1.27,
399
+ "grad_norm": 0.318359375,
400
+ "learning_rate": 0.00012,
401
+ "loss": 1.5529,
402
+ "step": 550
403
+ },
404
+ {
405
+ "epoch": 1.29,
406
+ "grad_norm": 0.35546875,
407
+ "learning_rate": 0.00012,
408
+ "loss": 1.5415,
409
+ "step": 560
410
+ },
411
+ {
412
+ "epoch": 1.32,
413
+ "grad_norm": 0.33203125,
414
+ "learning_rate": 0.00012,
415
+ "loss": 1.5511,
416
+ "step": 570
417
+ },
418
+ {
419
+ "epoch": 1.34,
420
+ "grad_norm": 0.3359375,
421
+ "learning_rate": 0.00012,
422
+ "loss": 1.5398,
423
+ "step": 580
424
+ },
425
+ {
426
+ "epoch": 1.36,
427
+ "grad_norm": 0.34375,
428
+ "learning_rate": 0.00012,
429
+ "loss": 1.5426,
430
+ "step": 590
431
+ },
432
+ {
433
+ "epoch": 1.39,
434
+ "grad_norm": 0.35546875,
435
+ "learning_rate": 0.00012,
436
+ "loss": 1.5334,
437
+ "step": 600
438
+ },
439
+ {
440
+ "epoch": 1.41,
441
+ "grad_norm": 0.330078125,
442
+ "learning_rate": 0.00012,
443
+ "loss": 1.5096,
444
+ "step": 610
445
+ },
446
+ {
447
+ "epoch": 1.43,
448
+ "grad_norm": 0.333984375,
449
+ "learning_rate": 0.00012,
450
+ "loss": 1.5416,
451
+ "step": 620
452
+ },
453
+ {
454
+ "epoch": 1.46,
455
+ "grad_norm": 0.376953125,
456
+ "learning_rate": 0.00012,
457
+ "loss": 1.5343,
458
+ "step": 630
459
+ },
460
+ {
461
+ "epoch": 1.48,
462
+ "grad_norm": 0.3359375,
463
+ "learning_rate": 0.00012,
464
+ "loss": 1.5416,
465
+ "step": 640
466
+ },
467
+ {
468
+ "epoch": 1.5,
469
+ "grad_norm": 0.33984375,
470
+ "learning_rate": 0.00012,
471
+ "loss": 1.5444,
472
+ "step": 650
473
+ },
474
+ {
475
+ "epoch": 1.53,
476
+ "grad_norm": 0.35546875,
477
+ "learning_rate": 0.00012,
478
+ "loss": 1.5112,
479
+ "step": 660
480
+ },
481
+ {
482
+ "epoch": 1.55,
483
+ "grad_norm": 0.35546875,
484
+ "learning_rate": 0.00012,
485
+ "loss": 1.5403,
486
+ "step": 670
487
+ },
488
+ {
489
+ "epoch": 1.57,
490
+ "grad_norm": 0.337890625,
491
+ "learning_rate": 0.00012,
492
+ "loss": 1.532,
493
+ "step": 680
494
+ },
495
+ {
496
+ "epoch": 1.59,
497
+ "grad_norm": 0.345703125,
498
+ "learning_rate": 0.00012,
499
+ "loss": 1.5451,
500
+ "step": 690
501
+ },
502
+ {
503
+ "epoch": 1.62,
504
+ "grad_norm": 0.337890625,
505
+ "learning_rate": 0.00012,
506
+ "loss": 1.5487,
507
+ "step": 700
508
+ },
509
+ {
510
+ "epoch": 1.64,
511
+ "grad_norm": 0.376953125,
512
+ "learning_rate": 0.00012,
513
+ "loss": 1.5529,
514
+ "step": 710
515
+ },
516
+ {
517
+ "epoch": 1.66,
518
+ "grad_norm": 0.3515625,
519
+ "learning_rate": 0.00012,
520
+ "loss": 1.5351,
521
+ "step": 720
522
+ },
523
+ {
524
+ "epoch": 1.69,
525
+ "grad_norm": 0.33984375,
526
+ "learning_rate": 0.00012,
527
+ "loss": 1.5045,
528
+ "step": 730
529
+ },
530
+ {
531
+ "epoch": 1.71,
532
+ "grad_norm": 0.33984375,
533
+ "learning_rate": 0.00012,
534
+ "loss": 1.5431,
535
+ "step": 740
536
+ },
537
+ {
538
+ "epoch": 1.73,
539
+ "grad_norm": 0.353515625,
540
+ "learning_rate": 0.00012,
541
+ "loss": 1.5456,
542
+ "step": 750
543
+ },
544
+ {
545
+ "epoch": 1.76,
546
+ "grad_norm": 0.353515625,
547
+ "learning_rate": 0.00012,
548
+ "loss": 1.537,
549
+ "step": 760
550
+ },
551
+ {
552
+ "epoch": 1.78,
553
+ "grad_norm": 0.3671875,
554
+ "learning_rate": 0.00012,
555
+ "loss": 1.5375,
556
+ "step": 770
557
+ },
558
+ {
559
+ "epoch": 1.8,
560
+ "grad_norm": 0.345703125,
561
+ "learning_rate": 0.00012,
562
+ "loss": 1.5305,
563
+ "step": 780
564
+ },
565
+ {
566
+ "epoch": 1.83,
567
+ "grad_norm": 0.369140625,
568
+ "learning_rate": 0.00012,
569
+ "loss": 1.5455,
570
+ "step": 790
571
+ },
572
+ {
573
+ "epoch": 1.85,
574
+ "grad_norm": 0.37109375,
575
+ "learning_rate": 0.00012,
576
+ "loss": 1.5163,
577
+ "step": 800
578
+ },
579
+ {
580
+ "epoch": 1.87,
581
+ "grad_norm": 0.349609375,
582
+ "learning_rate": 0.00012,
583
+ "loss": 1.5578,
584
+ "step": 810
585
+ },
586
+ {
587
+ "epoch": 1.9,
588
+ "grad_norm": 0.3515625,
589
+ "learning_rate": 0.00012,
590
+ "loss": 1.5191,
591
+ "step": 820
592
+ },
593
+ {
594
+ "epoch": 1.92,
595
+ "grad_norm": 0.357421875,
596
+ "learning_rate": 0.00012,
597
+ "loss": 1.5257,
598
+ "step": 830
599
+ },
600
+ {
601
+ "epoch": 1.94,
602
+ "grad_norm": 0.35546875,
603
+ "learning_rate": 0.00012,
604
+ "loss": 1.527,
605
+ "step": 840
606
+ },
607
+ {
608
+ "epoch": 1.96,
609
+ "grad_norm": 0.361328125,
610
+ "learning_rate": 0.00012,
611
+ "loss": 1.51,
612
+ "step": 850
613
+ },
614
+ {
615
+ "epoch": 1.99,
616
+ "grad_norm": 0.359375,
617
+ "learning_rate": 0.00012,
618
+ "loss": 1.5232,
619
+ "step": 860
620
+ },
621
+ {
622
+ "epoch": 2.0,
623
+ "eval_loss": 1.692084550857544,
624
+ "eval_runtime": 905.0512,
625
+ "eval_samples_per_second": 0.984,
626
+ "eval_steps_per_second": 0.984,
627
+ "step": 864
628
  }
629
  ],
630
  "logging_steps": 10,
 
632
  "num_input_tokens_seen": 0,
633
  "num_train_epochs": 2,
634
  "save_steps": 500,
635
+ "total_flos": 6.110663045346755e+17,
636
  "train_batch_size": 1,
637
  "trial_name": null,
638
  "trial_params": null