Training in progress, step 2425
Browse files
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 108113968
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ab0efb69d2dd4ccd18add7c8d575a842a3d074ef54403c99c6db819af71c77a1
|
| 3 |
size 108113968
|
last-checkpoint/adapter_config.json
CHANGED
|
@@ -29,13 +29,13 @@
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
-
"
|
| 33 |
-
"v_proj",
|
| 34 |
"k_proj",
|
|
|
|
| 35 |
"o_proj",
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"
|
| 39 |
],
|
| 40 |
"target_parameters": null,
|
| 41 |
"task_type": "CAUSAL_LM",
|
|
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
+
"down_proj",
|
|
|
|
| 33 |
"k_proj",
|
| 34 |
+
"up_proj",
|
| 35 |
"o_proj",
|
| 36 |
+
"gate_proj",
|
| 37 |
+
"v_proj",
|
| 38 |
+
"q_proj"
|
| 39 |
],
|
| 40 |
"target_parameters": null,
|
| 41 |
"task_type": "CAUSAL_LM",
|
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 108113968
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23cc74daf6c70e4089aff09333b0706b30bba28a0cf6991c49bb172b7614c70a
|
| 3 |
size 108113968
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 57081771
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:45e2021b79e47156a888c5a9b65619596c377b821c0a36373339d1a5a3dfdb5b
|
| 3 |
size 57081771
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb8c24123d0a6abb40712c04ec45e32a580173995d543bee32e57aefd8bd098c
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 300,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -498,476 +498,6 @@
|
|
| 498 |
"mean_token_accuracy": 0.9329368638992309,
|
| 499 |
"num_tokens": 122190.0,
|
| 500 |
"step": 1225
|
| 501 |
-
},
|
| 502 |
-
{
|
| 503 |
-
"entropy": 0.2075369682908058,
|
| 504 |
-
"epoch": 0.3582688449412439,
|
| 505 |
-
"grad_norm": 0.39205244183540344,
|
| 506 |
-
"learning_rate": 0.0002,
|
| 507 |
-
"loss": 0.2030281639099121,
|
| 508 |
-
"mean_token_accuracy": 0.9333784127235413,
|
| 509 |
-
"num_tokens": 40591.0,
|
| 510 |
-
"step": 1250
|
| 511 |
-
},
|
| 512 |
-
{
|
| 513 |
-
"entropy": 0.21607059925794603,
|
| 514 |
-
"epoch": 0.3654342218400688,
|
| 515 |
-
"grad_norm": 0.24999791383743286,
|
| 516 |
-
"learning_rate": 0.0002,
|
| 517 |
-
"loss": 0.21144355773925783,
|
| 518 |
-
"mean_token_accuracy": 0.93144282579422,
|
| 519 |
-
"num_tokens": 81873.0,
|
| 520 |
-
"step": 1275
|
| 521 |
-
},
|
| 522 |
-
{
|
| 523 |
-
"entropy": 0.2062842446565628,
|
| 524 |
-
"epoch": 0.37259959873889364,
|
| 525 |
-
"grad_norm": 0.36865198612213135,
|
| 526 |
-
"learning_rate": 0.0002,
|
| 527 |
-
"loss": 0.20512981414794923,
|
| 528 |
-
"mean_token_accuracy": 0.9319202589988709,
|
| 529 |
-
"num_tokens": 122819.0,
|
| 530 |
-
"step": 1300
|
| 531 |
-
},
|
| 532 |
-
{
|
| 533 |
-
"entropy": 0.20617701470851899,
|
| 534 |
-
"epoch": 0.37976497563771855,
|
| 535 |
-
"grad_norm": 0.2900436222553253,
|
| 536 |
-
"learning_rate": 0.0002,
|
| 537 |
-
"loss": 0.20203329086303712,
|
| 538 |
-
"mean_token_accuracy": 0.9338054943084717,
|
| 539 |
-
"num_tokens": 164426.0,
|
| 540 |
-
"step": 1325
|
| 541 |
-
},
|
| 542 |
-
{
|
| 543 |
-
"entropy": 0.20276340633630752,
|
| 544 |
-
"epoch": 0.3869303525365434,
|
| 545 |
-
"grad_norm": 0.3424394130706787,
|
| 546 |
-
"learning_rate": 0.0002,
|
| 547 |
-
"loss": 0.19647628784179688,
|
| 548 |
-
"mean_token_accuracy": 0.9352651834487915,
|
| 549 |
-
"num_tokens": 204693.0,
|
| 550 |
-
"step": 1350
|
| 551 |
-
},
|
| 552 |
-
{
|
| 553 |
-
"entropy": 0.19353608846664427,
|
| 554 |
-
"epoch": 0.3940957294353683,
|
| 555 |
-
"grad_norm": 0.2800115644931793,
|
| 556 |
-
"learning_rate": 0.0002,
|
| 557 |
-
"loss": 0.1909392547607422,
|
| 558 |
-
"mean_token_accuracy": 0.936229157447815,
|
| 559 |
-
"num_tokens": 245202.0,
|
| 560 |
-
"step": 1375
|
| 561 |
-
},
|
| 562 |
-
{
|
| 563 |
-
"entropy": 0.20765207797288895,
|
| 564 |
-
"epoch": 0.40126110633419315,
|
| 565 |
-
"grad_norm": 0.29286009073257446,
|
| 566 |
-
"learning_rate": 0.0002,
|
| 567 |
-
"loss": 0.20011020660400392,
|
| 568 |
-
"mean_token_accuracy": 0.9337928628921509,
|
| 569 |
-
"num_tokens": 286944.0,
|
| 570 |
-
"step": 1400
|
| 571 |
-
},
|
| 572 |
-
{
|
| 573 |
-
"entropy": 0.2191137021780014,
|
| 574 |
-
"epoch": 0.40842648323301806,
|
| 575 |
-
"grad_norm": 0.26620274782180786,
|
| 576 |
-
"learning_rate": 0.0002,
|
| 577 |
-
"loss": 0.21411985397338867,
|
| 578 |
-
"mean_token_accuracy": 0.9297138857841492,
|
| 579 |
-
"num_tokens": 329201.0,
|
| 580 |
-
"step": 1425
|
| 581 |
-
},
|
| 582 |
-
{
|
| 583 |
-
"entropy": 0.21264588654041292,
|
| 584 |
-
"epoch": 0.41559186013184296,
|
| 585 |
-
"grad_norm": 0.38385578989982605,
|
| 586 |
-
"learning_rate": 0.0002,
|
| 587 |
-
"loss": 0.208143367767334,
|
| 588 |
-
"mean_token_accuracy": 0.9316162085533142,
|
| 589 |
-
"num_tokens": 371891.0,
|
| 590 |
-
"step": 1450
|
| 591 |
-
},
|
| 592 |
-
{
|
| 593 |
-
"entropy": 0.2007578819990158,
|
| 594 |
-
"epoch": 0.4227572370306678,
|
| 595 |
-
"grad_norm": 0.3052174746990204,
|
| 596 |
-
"learning_rate": 0.0002,
|
| 597 |
-
"loss": 0.19854948043823242,
|
| 598 |
-
"mean_token_accuracy": 0.9345853734016418,
|
| 599 |
-
"num_tokens": 412534.0,
|
| 600 |
-
"step": 1475
|
| 601 |
-
},
|
| 602 |
-
{
|
| 603 |
-
"entropy": 0.20256735682487487,
|
| 604 |
-
"epoch": 0.4299226139294927,
|
| 605 |
-
"grad_norm": 0.2761523723602295,
|
| 606 |
-
"learning_rate": 0.0002,
|
| 607 |
-
"loss": 0.19539085388183594,
|
| 608 |
-
"mean_token_accuracy": 0.9367341923713685,
|
| 609 |
-
"num_tokens": 452855.0,
|
| 610 |
-
"step": 1500
|
| 611 |
-
},
|
| 612 |
-
{
|
| 613 |
-
"entropy": 0.19164222806692124,
|
| 614 |
-
"epoch": 0.43708799082831756,
|
| 615 |
-
"grad_norm": 0.3495299220085144,
|
| 616 |
-
"learning_rate": 0.0002,
|
| 617 |
-
"loss": 0.1890553665161133,
|
| 618 |
-
"mean_token_accuracy": 0.9373278784751892,
|
| 619 |
-
"num_tokens": 493202.0,
|
| 620 |
-
"step": 1525
|
| 621 |
-
},
|
| 622 |
-
{
|
| 623 |
-
"entropy": 0.20341520249843598,
|
| 624 |
-
"epoch": 0.44425336772714247,
|
| 625 |
-
"grad_norm": 0.3206697702407837,
|
| 626 |
-
"learning_rate": 0.0002,
|
| 627 |
-
"loss": 0.20173826217651367,
|
| 628 |
-
"mean_token_accuracy": 0.9332520008087158,
|
| 629 |
-
"num_tokens": 534946.0,
|
| 630 |
-
"step": 1550
|
| 631 |
-
},
|
| 632 |
-
{
|
| 633 |
-
"entropy": 0.20512860178947448,
|
| 634 |
-
"epoch": 0.4514187446259673,
|
| 635 |
-
"grad_norm": 0.369289755821228,
|
| 636 |
-
"learning_rate": 0.0002,
|
| 637 |
-
"loss": 0.1998735237121582,
|
| 638 |
-
"mean_token_accuracy": 0.9336103320121765,
|
| 639 |
-
"num_tokens": 576115.0,
|
| 640 |
-
"step": 1575
|
| 641 |
-
},
|
| 642 |
-
{
|
| 643 |
-
"entropy": 0.19730552673339843,
|
| 644 |
-
"epoch": 0.4585841215247922,
|
| 645 |
-
"grad_norm": 0.1693185716867447,
|
| 646 |
-
"learning_rate": 0.0002,
|
| 647 |
-
"loss": 0.19725181579589843,
|
| 648 |
-
"mean_token_accuracy": 0.9343607997894288,
|
| 649 |
-
"num_tokens": 616923.0,
|
| 650 |
-
"step": 1600
|
| 651 |
-
},
|
| 652 |
-
{
|
| 653 |
-
"entropy": 0.20145605146884918,
|
| 654 |
-
"epoch": 0.46574949842361707,
|
| 655 |
-
"grad_norm": 0.34067076444625854,
|
| 656 |
-
"learning_rate": 0.0002,
|
| 657 |
-
"loss": 0.19863763809204102,
|
| 658 |
-
"mean_token_accuracy": 0.9344722628593445,
|
| 659 |
-
"num_tokens": 658021.0,
|
| 660 |
-
"step": 1625
|
| 661 |
-
},
|
| 662 |
-
{
|
| 663 |
-
"entropy": 0.19174030989408494,
|
| 664 |
-
"epoch": 0.472914875322442,
|
| 665 |
-
"grad_norm": 0.282787024974823,
|
| 666 |
-
"learning_rate": 0.0002,
|
| 667 |
-
"loss": 0.18856592178344728,
|
| 668 |
-
"mean_token_accuracy": 0.9382144474983215,
|
| 669 |
-
"num_tokens": 698701.0,
|
| 670 |
-
"step": 1650
|
| 671 |
-
},
|
| 672 |
-
{
|
| 673 |
-
"entropy": 0.19893687069416047,
|
| 674 |
-
"epoch": 0.4800802522212668,
|
| 675 |
-
"grad_norm": 0.21854329109191895,
|
| 676 |
-
"learning_rate": 0.0002,
|
| 677 |
-
"loss": 0.19327503204345703,
|
| 678 |
-
"mean_token_accuracy": 0.9353450679779053,
|
| 679 |
-
"num_tokens": 739913.0,
|
| 680 |
-
"step": 1675
|
| 681 |
-
},
|
| 682 |
-
{
|
| 683 |
-
"entropy": 0.19346537590026855,
|
| 684 |
-
"epoch": 0.48724562912009173,
|
| 685 |
-
"grad_norm": 0.19436436891555786,
|
| 686 |
-
"learning_rate": 0.0002,
|
| 687 |
-
"loss": 0.19321285247802733,
|
| 688 |
-
"mean_token_accuracy": 0.9373372173309327,
|
| 689 |
-
"num_tokens": 780719.0,
|
| 690 |
-
"step": 1700
|
| 691 |
-
},
|
| 692 |
-
{
|
| 693 |
-
"entropy": 0.20528113186359406,
|
| 694 |
-
"epoch": 0.4944110060189166,
|
| 695 |
-
"grad_norm": 0.31415456533432007,
|
| 696 |
-
"learning_rate": 0.0002,
|
| 697 |
-
"loss": 0.2044132423400879,
|
| 698 |
-
"mean_token_accuracy": 0.9320711612701416,
|
| 699 |
-
"num_tokens": 822130.0,
|
| 700 |
-
"step": 1725
|
| 701 |
-
},
|
| 702 |
-
{
|
| 703 |
-
"entropy": 0.20051146537065506,
|
| 704 |
-
"epoch": 0.5015763829177414,
|
| 705 |
-
"grad_norm": 0.36767083406448364,
|
| 706 |
-
"learning_rate": 0.0002,
|
| 707 |
-
"loss": 0.19968202590942383,
|
| 708 |
-
"mean_token_accuracy": 0.9361233901977539,
|
| 709 |
-
"num_tokens": 863055.0,
|
| 710 |
-
"step": 1750
|
| 711 |
-
},
|
| 712 |
-
{
|
| 713 |
-
"entropy": 0.19146274596452714,
|
| 714 |
-
"epoch": 0.5087417598165663,
|
| 715 |
-
"grad_norm": 0.36641210317611694,
|
| 716 |
-
"learning_rate": 0.0002,
|
| 717 |
-
"loss": 0.1849520492553711,
|
| 718 |
-
"mean_token_accuracy": 0.9378811025619507,
|
| 719 |
-
"num_tokens": 903979.0,
|
| 720 |
-
"step": 1775
|
| 721 |
-
},
|
| 722 |
-
{
|
| 723 |
-
"entropy": 0.20497863948345185,
|
| 724 |
-
"epoch": 0.5159071367153912,
|
| 725 |
-
"grad_norm": 0.41181716322898865,
|
| 726 |
-
"learning_rate": 0.0002,
|
| 727 |
-
"loss": 0.2043849754333496,
|
| 728 |
-
"mean_token_accuracy": 0.9320770597457886,
|
| 729 |
-
"num_tokens": 945010.0,
|
| 730 |
-
"step": 1800
|
| 731 |
-
},
|
| 732 |
-
{
|
| 733 |
-
"entropy": 0.19871506720781326,
|
| 734 |
-
"epoch": 0.5230725136142161,
|
| 735 |
-
"grad_norm": 0.34865760803222656,
|
| 736 |
-
"learning_rate": 0.0002,
|
| 737 |
-
"loss": 0.19058765411376954,
|
| 738 |
-
"mean_token_accuracy": 0.936968915462494,
|
| 739 |
-
"num_tokens": 985351.0,
|
| 740 |
-
"step": 1825
|
| 741 |
-
},
|
| 742 |
-
{
|
| 743 |
-
"entropy": 0.21031922459602356,
|
| 744 |
-
"epoch": 0.5302378905130409,
|
| 745 |
-
"grad_norm": 0.35983604192733765,
|
| 746 |
-
"learning_rate": 0.0002,
|
| 747 |
-
"loss": 0.20398990631103517,
|
| 748 |
-
"mean_token_accuracy": 0.9338763618469238,
|
| 749 |
-
"num_tokens": 1027146.0,
|
| 750 |
-
"step": 1850
|
| 751 |
-
},
|
| 752 |
-
{
|
| 753 |
-
"entropy": 0.20145108669996262,
|
| 754 |
-
"epoch": 0.5374032674118658,
|
| 755 |
-
"grad_norm": 0.2126716524362564,
|
| 756 |
-
"learning_rate": 0.0002,
|
| 757 |
-
"loss": 0.19558551788330078,
|
| 758 |
-
"mean_token_accuracy": 0.9350816106796265,
|
| 759 |
-
"num_tokens": 1068454.0,
|
| 760 |
-
"step": 1875
|
| 761 |
-
},
|
| 762 |
-
{
|
| 763 |
-
"entropy": 0.19600239813327788,
|
| 764 |
-
"epoch": 0.5445686443106907,
|
| 765 |
-
"grad_norm": 0.2547587752342224,
|
| 766 |
-
"learning_rate": 0.0002,
|
| 767 |
-
"loss": 0.18890924453735353,
|
| 768 |
-
"mean_token_accuracy": 0.9360025477409363,
|
| 769 |
-
"num_tokens": 1109230.0,
|
| 770 |
-
"step": 1900
|
| 771 |
-
},
|
| 772 |
-
{
|
| 773 |
-
"entropy": 0.17782112330198288,
|
| 774 |
-
"epoch": 0.5517340212095156,
|
| 775 |
-
"grad_norm": 0.28866520524024963,
|
| 776 |
-
"learning_rate": 0.0002,
|
| 777 |
-
"loss": 0.17644382476806642,
|
| 778 |
-
"mean_token_accuracy": 0.9422430300712585,
|
| 779 |
-
"num_tokens": 1148978.0,
|
| 780 |
-
"step": 1925
|
| 781 |
-
},
|
| 782 |
-
{
|
| 783 |
-
"entropy": 0.18634845435619354,
|
| 784 |
-
"epoch": 0.5588993981083406,
|
| 785 |
-
"grad_norm": 0.2348451316356659,
|
| 786 |
-
"learning_rate": 0.0002,
|
| 787 |
-
"loss": 0.1815641212463379,
|
| 788 |
-
"mean_token_accuracy": 0.9392524695396424,
|
| 789 |
-
"num_tokens": 1189196.0,
|
| 790 |
-
"step": 1950
|
| 791 |
-
},
|
| 792 |
-
{
|
| 793 |
-
"entropy": 0.18852397054433823,
|
| 794 |
-
"epoch": 0.5660647750071653,
|
| 795 |
-
"grad_norm": 0.25562164187431335,
|
| 796 |
-
"learning_rate": 0.0002,
|
| 797 |
-
"loss": 0.18350950241088868,
|
| 798 |
-
"mean_token_accuracy": 0.9394415140151977,
|
| 799 |
-
"num_tokens": 1229072.0,
|
| 800 |
-
"step": 1975
|
| 801 |
-
},
|
| 802 |
-
{
|
| 803 |
-
"entropy": 0.18256970256567,
|
| 804 |
-
"epoch": 0.5732301519059902,
|
| 805 |
-
"grad_norm": 0.36442917585372925,
|
| 806 |
-
"learning_rate": 0.0002,
|
| 807 |
-
"loss": 0.18093914031982422,
|
| 808 |
-
"mean_token_accuracy": 0.9397966265678406,
|
| 809 |
-
"num_tokens": 1269371.0,
|
| 810 |
-
"step": 2000
|
| 811 |
-
},
|
| 812 |
-
{
|
| 813 |
-
"entropy": 0.20554341971874238,
|
| 814 |
-
"epoch": 0.5803955288048152,
|
| 815 |
-
"grad_norm": 0.3102213442325592,
|
| 816 |
-
"learning_rate": 0.0002,
|
| 817 |
-
"loss": 0.2052627372741699,
|
| 818 |
-
"mean_token_accuracy": 0.9325902485847473,
|
| 819 |
-
"num_tokens": 1311354.0,
|
| 820 |
-
"step": 2025
|
| 821 |
-
},
|
| 822 |
-
{
|
| 823 |
-
"entropy": 0.2037496653199196,
|
| 824 |
-
"epoch": 0.5875609057036401,
|
| 825 |
-
"grad_norm": 0.24330857396125793,
|
| 826 |
-
"learning_rate": 0.0002,
|
| 827 |
-
"loss": 0.20022052764892578,
|
| 828 |
-
"mean_token_accuracy": 0.9342735767364502,
|
| 829 |
-
"num_tokens": 1353051.0,
|
| 830 |
-
"step": 2050
|
| 831 |
-
},
|
| 832 |
-
{
|
| 833 |
-
"entropy": 0.19858424603939057,
|
| 834 |
-
"epoch": 0.5947262826024649,
|
| 835 |
-
"grad_norm": 0.2955344021320343,
|
| 836 |
-
"learning_rate": 0.0002,
|
| 837 |
-
"loss": 0.19497306823730468,
|
| 838 |
-
"mean_token_accuracy": 0.9353799057006836,
|
| 839 |
-
"num_tokens": 1394712.0,
|
| 840 |
-
"step": 2075
|
| 841 |
-
},
|
| 842 |
-
{
|
| 843 |
-
"entropy": 0.20194011509418489,
|
| 844 |
-
"epoch": 0.6018916595012898,
|
| 845 |
-
"grad_norm": 0.20898522436618805,
|
| 846 |
-
"learning_rate": 0.0002,
|
| 847 |
-
"loss": 0.19739873886108397,
|
| 848 |
-
"mean_token_accuracy": 0.9346665263175964,
|
| 849 |
-
"num_tokens": 1436282.0,
|
| 850 |
-
"step": 2100
|
| 851 |
-
},
|
| 852 |
-
{
|
| 853 |
-
"entropy": 0.18827197730541229,
|
| 854 |
-
"epoch": 0.6090570364001147,
|
| 855 |
-
"grad_norm": 0.3064703643321991,
|
| 856 |
-
"learning_rate": 0.0002,
|
| 857 |
-
"loss": 0.1838802719116211,
|
| 858 |
-
"mean_token_accuracy": 0.939849009513855,
|
| 859 |
-
"num_tokens": 1476569.0,
|
| 860 |
-
"step": 2125
|
| 861 |
-
},
|
| 862 |
-
{
|
| 863 |
-
"entropy": 0.20322401821613312,
|
| 864 |
-
"epoch": 0.6162224132989396,
|
| 865 |
-
"grad_norm": 0.42201489210128784,
|
| 866 |
-
"learning_rate": 0.0002,
|
| 867 |
-
"loss": 0.20033023834228517,
|
| 868 |
-
"mean_token_accuracy": 0.9345659923553467,
|
| 869 |
-
"num_tokens": 1518315.0,
|
| 870 |
-
"step": 2150
|
| 871 |
-
},
|
| 872 |
-
{
|
| 873 |
-
"entropy": 0.1822732812166214,
|
| 874 |
-
"epoch": 0.6233877901977644,
|
| 875 |
-
"grad_norm": 0.2799566388130188,
|
| 876 |
-
"learning_rate": 0.0002,
|
| 877 |
-
"loss": 0.18143136978149413,
|
| 878 |
-
"mean_token_accuracy": 0.9404231834411622,
|
| 879 |
-
"num_tokens": 1558340.0,
|
| 880 |
-
"step": 2175
|
| 881 |
-
},
|
| 882 |
-
{
|
| 883 |
-
"entropy": 0.19505684196949005,
|
| 884 |
-
"epoch": 0.6305531670965893,
|
| 885 |
-
"grad_norm": 0.20578612387180328,
|
| 886 |
-
"learning_rate": 0.0002,
|
| 887 |
-
"loss": 0.18889547348022462,
|
| 888 |
-
"mean_token_accuracy": 0.9381808185577393,
|
| 889 |
-
"num_tokens": 1599592.0,
|
| 890 |
-
"step": 2200
|
| 891 |
-
},
|
| 892 |
-
{
|
| 893 |
-
"entropy": 0.19981920778751372,
|
| 894 |
-
"epoch": 0.6377185439954142,
|
| 895 |
-
"grad_norm": 0.28131991624832153,
|
| 896 |
-
"learning_rate": 0.0002,
|
| 897 |
-
"loss": 0.19793636322021485,
|
| 898 |
-
"mean_token_accuracy": 0.935631537437439,
|
| 899 |
-
"num_tokens": 1641401.0,
|
| 900 |
-
"step": 2225
|
| 901 |
-
},
|
| 902 |
-
{
|
| 903 |
-
"entropy": 0.19168403446674348,
|
| 904 |
-
"epoch": 0.6448839208942391,
|
| 905 |
-
"grad_norm": 0.25856539607048035,
|
| 906 |
-
"learning_rate": 0.0002,
|
| 907 |
-
"loss": 0.1897783088684082,
|
| 908 |
-
"mean_token_accuracy": 0.9356019353866577,
|
| 909 |
-
"num_tokens": 1682949.0,
|
| 910 |
-
"step": 2250
|
| 911 |
-
},
|
| 912 |
-
{
|
| 913 |
-
"entropy": 0.1931222453713417,
|
| 914 |
-
"epoch": 0.6520492977930639,
|
| 915 |
-
"grad_norm": 0.4090195596218109,
|
| 916 |
-
"learning_rate": 0.0002,
|
| 917 |
-
"loss": 0.1929492950439453,
|
| 918 |
-
"mean_token_accuracy": 0.9369300937652588,
|
| 919 |
-
"num_tokens": 1724557.0,
|
| 920 |
-
"step": 2275
|
| 921 |
-
},
|
| 922 |
-
{
|
| 923 |
-
"entropy": 0.19567115902900695,
|
| 924 |
-
"epoch": 0.6592146746918888,
|
| 925 |
-
"grad_norm": 0.19224579632282257,
|
| 926 |
-
"learning_rate": 0.0002,
|
| 927 |
-
"loss": 0.19031965255737304,
|
| 928 |
-
"mean_token_accuracy": 0.9367053961753845,
|
| 929 |
-
"num_tokens": 1765618.0,
|
| 930 |
-
"step": 2300
|
| 931 |
-
},
|
| 932 |
-
{
|
| 933 |
-
"entropy": 0.18622912466526031,
|
| 934 |
-
"epoch": 0.6663800515907137,
|
| 935 |
-
"grad_norm": 0.27013909816741943,
|
| 936 |
-
"learning_rate": 0.0002,
|
| 937 |
-
"loss": 0.18465063095092774,
|
| 938 |
-
"mean_token_accuracy": 0.9383154940605164,
|
| 939 |
-
"num_tokens": 1806491.0,
|
| 940 |
-
"step": 2325
|
| 941 |
-
},
|
| 942 |
-
{
|
| 943 |
-
"entropy": 0.19851551949977875,
|
| 944 |
-
"epoch": 0.6735454284895386,
|
| 945 |
-
"grad_norm": 0.3999996483325958,
|
| 946 |
-
"learning_rate": 0.0002,
|
| 947 |
-
"loss": 0.19640205383300782,
|
| 948 |
-
"mean_token_accuracy": 0.9344918823242188,
|
| 949 |
-
"num_tokens": 1848741.0,
|
| 950 |
-
"step": 2350
|
| 951 |
-
},
|
| 952 |
-
{
|
| 953 |
-
"entropy": 0.18972006916999817,
|
| 954 |
-
"epoch": 0.6807108053883635,
|
| 955 |
-
"grad_norm": 0.26580268144607544,
|
| 956 |
-
"learning_rate": 0.0002,
|
| 957 |
-
"loss": 0.1871095657348633,
|
| 958 |
-
"mean_token_accuracy": 0.9390228629112244,
|
| 959 |
-
"num_tokens": 1890071.0,
|
| 960 |
-
"step": 2375
|
| 961 |
-
},
|
| 962 |
-
{
|
| 963 |
-
"entropy": 0.19580536246299743,
|
| 964 |
-
"epoch": 0.6878761822871883,
|
| 965 |
-
"grad_norm": 0.2682396471500397,
|
| 966 |
-
"learning_rate": 0.0002,
|
| 967 |
-
"loss": 0.19406461715698242,
|
| 968 |
-
"mean_token_accuracy": 0.9354706478118896,
|
| 969 |
-
"num_tokens": 1931751.0,
|
| 970 |
-
"step": 2400
|
| 971 |
}
|
| 972 |
],
|
| 973 |
"logging_steps": 25,
|
|
@@ -987,7 +517,7 @@
|
|
| 987 |
"attributes": {}
|
| 988 |
}
|
| 989 |
},
|
| 990 |
-
"total_flos":
|
| 991 |
"train_batch_size": 4,
|
| 992 |
"trial_name": null,
|
| 993 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.35110346804241904,
|
| 6 |
"eval_steps": 300,
|
| 7 |
+
"global_step": 1225,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 498 |
"mean_token_accuracy": 0.9329368638992309,
|
| 499 |
"num_tokens": 122190.0,
|
| 500 |
"step": 1225
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
}
|
| 502 |
],
|
| 503 |
"logging_steps": 25,
|
|
|
|
| 517 |
"attributes": {}
|
| 518 |
}
|
| 519 |
},
|
| 520 |
+
"total_flos": 1.1890534404816077e+17,
|
| 521 |
"train_batch_size": 4,
|
| 522 |
"trial_name": null,
|
| 523 |
"trial_params": null
|