Training in progress, step 2700, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 228140600
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:df0b3c057589426de11702e8aa51f40578fbdc1c16b5298b4df1b3741a358543
|
| 3 |
size 228140600
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 117931203
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e1a2a35f3f40624f11f416233f78a070b1dea29da95a3a90a9a787a9173de3d
|
| 3 |
size 117931203
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:54ee403e6e7f52e165fb91ab2843ca4f38ca3d3c64d81b59c5a39f9e4c098413
|
| 3 |
size 14645
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88be0f049d620e88b111c309644f5ca8c552ca0e64dbf5a41f67ac4dd14016eb
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6abcf0c15a7ba90c608cb1903d96b4ad18eb9806fb694a46be4e23a52b64410b
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": 750,
|
| 3 |
"best_metric": 0.5089643597602844,
|
| 4 |
"best_model_checkpoint": "./adapter-phase1/checkpoint-750",
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 300,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2564,6 +2564,318 @@
|
|
| 2564 |
"eval_samples_per_second": 2.301,
|
| 2565 |
"eval_steps_per_second": 0.575,
|
| 2566 |
"step": 2400
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2567 |
}
|
| 2568 |
],
|
| 2569 |
"logging_steps": 10,
|
|
@@ -2583,7 +2895,7 @@
|
|
| 2583 |
"attributes": {}
|
| 2584 |
}
|
| 2585 |
},
|
| 2586 |
-
"total_flos": 4.
|
| 2587 |
"train_batch_size": 1,
|
| 2588 |
"trial_name": null,
|
| 2589 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": 750,
|
| 3 |
"best_metric": 0.5089643597602844,
|
| 4 |
"best_model_checkpoint": "./adapter-phase1/checkpoint-750",
|
| 5 |
+
"epoch": 4.32,
|
| 6 |
"eval_steps": 300,
|
| 7 |
+
"global_step": 2700,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2564 |
"eval_samples_per_second": 2.301,
|
| 2565 |
"eval_steps_per_second": 0.575,
|
| 2566 |
"step": 2400
|
| 2567 |
+
},
|
| 2568 |
+
{
|
| 2569 |
+
"entropy": 0.2655953477136791,
|
| 2570 |
+
"epoch": 3.856,
|
| 2571 |
+
"grad_norm": 0.8277497291564941,
|
| 2572 |
+
"learning_rate": 2.2976e-05,
|
| 2573 |
+
"loss": 0.2109,
|
| 2574 |
+
"mean_token_accuracy": 0.9393812574446201,
|
| 2575 |
+
"num_tokens": 1011268.0,
|
| 2576 |
+
"step": 2410
|
| 2577 |
+
},
|
| 2578 |
+
{
|
| 2579 |
+
"entropy": 0.2920296056661755,
|
| 2580 |
+
"epoch": 3.872,
|
| 2581 |
+
"grad_norm": 1.015434980392456,
|
| 2582 |
+
"learning_rate": 2.2656e-05,
|
| 2583 |
+
"loss": 0.2243,
|
| 2584 |
+
"mean_token_accuracy": 0.9357186656445264,
|
| 2585 |
+
"num_tokens": 1026942.0,
|
| 2586 |
+
"step": 2420
|
| 2587 |
+
},
|
| 2588 |
+
{
|
| 2589 |
+
"entropy": 0.2859017666429281,
|
| 2590 |
+
"epoch": 3.888,
|
| 2591 |
+
"grad_norm": 0.6656726002693176,
|
| 2592 |
+
"learning_rate": 2.2336e-05,
|
| 2593 |
+
"loss": 0.2389,
|
| 2594 |
+
"mean_token_accuracy": 0.9283736657351256,
|
| 2595 |
+
"num_tokens": 1053937.0,
|
| 2596 |
+
"step": 2430
|
| 2597 |
+
},
|
| 2598 |
+
{
|
| 2599 |
+
"entropy": 0.24961302392184734,
|
| 2600 |
+
"epoch": 3.904,
|
| 2601 |
+
"grad_norm": 0.8390278816223145,
|
| 2602 |
+
"learning_rate": 2.2016e-05,
|
| 2603 |
+
"loss": 0.2211,
|
| 2604 |
+
"mean_token_accuracy": 0.9312011521309614,
|
| 2605 |
+
"num_tokens": 1084820.0,
|
| 2606 |
+
"step": 2440
|
| 2607 |
+
},
|
| 2608 |
+
{
|
| 2609 |
+
"entropy": 0.2519187033176422,
|
| 2610 |
+
"epoch": 3.92,
|
| 2611 |
+
"grad_norm": 0.8542287349700928,
|
| 2612 |
+
"learning_rate": 2.1696e-05,
|
| 2613 |
+
"loss": 0.2126,
|
| 2614 |
+
"mean_token_accuracy": 0.9375488836318254,
|
| 2615 |
+
"num_tokens": 1109943.0,
|
| 2616 |
+
"step": 2450
|
| 2617 |
+
},
|
| 2618 |
+
{
|
| 2619 |
+
"entropy": 0.27277124775573613,
|
| 2620 |
+
"epoch": 3.936,
|
| 2621 |
+
"grad_norm": 0.9245595335960388,
|
| 2622 |
+
"learning_rate": 2.1376e-05,
|
| 2623 |
+
"loss": 0.2161,
|
| 2624 |
+
"mean_token_accuracy": 0.9364014331251382,
|
| 2625 |
+
"num_tokens": 1130543.0,
|
| 2626 |
+
"step": 2460
|
| 2627 |
+
},
|
| 2628 |
+
{
|
| 2629 |
+
"entropy": 0.28273853762075307,
|
| 2630 |
+
"epoch": 3.952,
|
| 2631 |
+
"grad_norm": 0.9764724969863892,
|
| 2632 |
+
"learning_rate": 2.1056e-05,
|
| 2633 |
+
"loss": 0.2217,
|
| 2634 |
+
"mean_token_accuracy": 0.9356040749698877,
|
| 2635 |
+
"num_tokens": 1146676.0,
|
| 2636 |
+
"step": 2470
|
| 2637 |
+
},
|
| 2638 |
+
{
|
| 2639 |
+
"entropy": 0.2879827093333006,
|
| 2640 |
+
"epoch": 3.968,
|
| 2641 |
+
"grad_norm": 0.7532303929328918,
|
| 2642 |
+
"learning_rate": 2.0736e-05,
|
| 2643 |
+
"loss": 0.2413,
|
| 2644 |
+
"mean_token_accuracy": 0.9290374431759119,
|
| 2645 |
+
"num_tokens": 1172078.0,
|
| 2646 |
+
"step": 2480
|
| 2647 |
+
},
|
| 2648 |
+
{
|
| 2649 |
+
"entropy": 0.2530561724677682,
|
| 2650 |
+
"epoch": 3.984,
|
| 2651 |
+
"grad_norm": 0.8568546175956726,
|
| 2652 |
+
"learning_rate": 2.0416000000000002e-05,
|
| 2653 |
+
"loss": 0.2177,
|
| 2654 |
+
"mean_token_accuracy": 0.9337470591068268,
|
| 2655 |
+
"num_tokens": 1197464.0,
|
| 2656 |
+
"step": 2490
|
| 2657 |
+
},
|
| 2658 |
+
{
|
| 2659 |
+
"entropy": 0.3038310568779707,
|
| 2660 |
+
"epoch": 4.0,
|
| 2661 |
+
"grad_norm": 0.9622617959976196,
|
| 2662 |
+
"learning_rate": 2.0096000000000002e-05,
|
| 2663 |
+
"loss": 0.2368,
|
| 2664 |
+
"mean_token_accuracy": 0.9296225290745497,
|
| 2665 |
+
"num_tokens": 1212204.0,
|
| 2666 |
+
"step": 2500
|
| 2667 |
+
},
|
| 2668 |
+
{
|
| 2669 |
+
"entropy": 0.24809251818805933,
|
| 2670 |
+
"epoch": 4.016,
|
| 2671 |
+
"grad_norm": 0.8197008371353149,
|
| 2672 |
+
"learning_rate": 1.9776000000000002e-05,
|
| 2673 |
+
"loss": 0.2395,
|
| 2674 |
+
"mean_token_accuracy": 0.928604032099247,
|
| 2675 |
+
"num_tokens": 1253458.0,
|
| 2676 |
+
"step": 2510
|
| 2677 |
+
},
|
| 2678 |
+
{
|
| 2679 |
+
"entropy": 0.24905966678634286,
|
| 2680 |
+
"epoch": 4.032,
|
| 2681 |
+
"grad_norm": 0.8056384921073914,
|
| 2682 |
+
"learning_rate": 1.9456e-05,
|
| 2683 |
+
"loss": 0.2301,
|
| 2684 |
+
"mean_token_accuracy": 0.9330911111086607,
|
| 2685 |
+
"num_tokens": 1282365.0,
|
| 2686 |
+
"step": 2520
|
| 2687 |
+
},
|
| 2688 |
+
{
|
| 2689 |
+
"entropy": 0.26601817598566413,
|
| 2690 |
+
"epoch": 4.048,
|
| 2691 |
+
"grad_norm": 0.9766417145729065,
|
| 2692 |
+
"learning_rate": 1.9136e-05,
|
| 2693 |
+
"loss": 0.2237,
|
| 2694 |
+
"mean_token_accuracy": 0.9384452097117901,
|
| 2695 |
+
"num_tokens": 1305420.0,
|
| 2696 |
+
"step": 2530
|
| 2697 |
+
},
|
| 2698 |
+
{
|
| 2699 |
+
"entropy": 0.28673125999048354,
|
| 2700 |
+
"epoch": 4.064,
|
| 2701 |
+
"grad_norm": 1.2241604328155518,
|
| 2702 |
+
"learning_rate": 1.8816e-05,
|
| 2703 |
+
"loss": 0.2615,
|
| 2704 |
+
"mean_token_accuracy": 0.9268214203417301,
|
| 2705 |
+
"num_tokens": 1323367.0,
|
| 2706 |
+
"step": 2540
|
| 2707 |
+
},
|
| 2708 |
+
{
|
| 2709 |
+
"entropy": 0.3297149523161352,
|
| 2710 |
+
"epoch": 4.08,
|
| 2711 |
+
"grad_norm": 1.2444630861282349,
|
| 2712 |
+
"learning_rate": 1.8496000000000004e-05,
|
| 2713 |
+
"loss": 0.266,
|
| 2714 |
+
"mean_token_accuracy": 0.9285014558583498,
|
| 2715 |
+
"num_tokens": 1335370.0,
|
| 2716 |
+
"step": 2550
|
| 2717 |
+
},
|
| 2718 |
+
{
|
| 2719 |
+
"entropy": 0.25180468857288363,
|
| 2720 |
+
"epoch": 4.096,
|
| 2721 |
+
"grad_norm": 0.6901214718818665,
|
| 2722 |
+
"learning_rate": 1.8176e-05,
|
| 2723 |
+
"loss": 0.2242,
|
| 2724 |
+
"mean_token_accuracy": 0.9317782554775477,
|
| 2725 |
+
"num_tokens": 1374567.0,
|
| 2726 |
+
"step": 2560
|
| 2727 |
+
},
|
| 2728 |
+
{
|
| 2729 |
+
"entropy": 0.25819407450035214,
|
| 2730 |
+
"epoch": 4.112,
|
| 2731 |
+
"grad_norm": 0.8702373504638672,
|
| 2732 |
+
"learning_rate": 1.7856e-05,
|
| 2733 |
+
"loss": 0.2344,
|
| 2734 |
+
"mean_token_accuracy": 0.9326971143484115,
|
| 2735 |
+
"num_tokens": 1402608.0,
|
| 2736 |
+
"step": 2570
|
| 2737 |
+
},
|
| 2738 |
+
{
|
| 2739 |
+
"entropy": 0.26549670435488226,
|
| 2740 |
+
"epoch": 4.128,
|
| 2741 |
+
"grad_norm": 0.7631207704544067,
|
| 2742 |
+
"learning_rate": 1.7536e-05,
|
| 2743 |
+
"loss": 0.2297,
|
| 2744 |
+
"mean_token_accuracy": 0.9365796335041523,
|
| 2745 |
+
"num_tokens": 1425524.0,
|
| 2746 |
+
"step": 2580
|
| 2747 |
+
},
|
| 2748 |
+
{
|
| 2749 |
+
"entropy": 0.26975566176697613,
|
| 2750 |
+
"epoch": 4.144,
|
| 2751 |
+
"grad_norm": 1.1718668937683105,
|
| 2752 |
+
"learning_rate": 1.7216000000000003e-05,
|
| 2753 |
+
"loss": 0.221,
|
| 2754 |
+
"mean_token_accuracy": 0.9397962510585784,
|
| 2755 |
+
"num_tokens": 1444092.0,
|
| 2756 |
+
"step": 2590
|
| 2757 |
+
},
|
| 2758 |
+
{
|
| 2759 |
+
"entropy": 0.3168819394893944,
|
| 2760 |
+
"epoch": 4.16,
|
| 2761 |
+
"grad_norm": 1.0534077882766724,
|
| 2762 |
+
"learning_rate": 1.6896000000000002e-05,
|
| 2763 |
+
"loss": 0.2544,
|
| 2764 |
+
"mean_token_accuracy": 0.9319371480494738,
|
| 2765 |
+
"num_tokens": 1456844.0,
|
| 2766 |
+
"step": 2600
|
| 2767 |
+
},
|
| 2768 |
+
{
|
| 2769 |
+
"entropy": 0.25265237540006635,
|
| 2770 |
+
"epoch": 4.176,
|
| 2771 |
+
"grad_norm": 0.7592364549636841,
|
| 2772 |
+
"learning_rate": 1.6576e-05,
|
| 2773 |
+
"loss": 0.2395,
|
| 2774 |
+
"mean_token_accuracy": 0.9289916418492794,
|
| 2775 |
+
"num_tokens": 1496545.0,
|
| 2776 |
+
"step": 2610
|
| 2777 |
+
},
|
| 2778 |
+
{
|
| 2779 |
+
"entropy": 0.2543726827017963,
|
| 2780 |
+
"epoch": 4.192,
|
| 2781 |
+
"grad_norm": 0.9639586210250854,
|
| 2782 |
+
"learning_rate": 1.6256e-05,
|
| 2783 |
+
"loss": 0.2351,
|
| 2784 |
+
"mean_token_accuracy": 0.9337568439543247,
|
| 2785 |
+
"num_tokens": 1525103.0,
|
| 2786 |
+
"step": 2620
|
| 2787 |
+
},
|
| 2788 |
+
{
|
| 2789 |
+
"entropy": 0.26547051025554536,
|
| 2790 |
+
"epoch": 4.208,
|
| 2791 |
+
"grad_norm": 0.9620559215545654,
|
| 2792 |
+
"learning_rate": 1.5936e-05,
|
| 2793 |
+
"loss": 0.2382,
|
| 2794 |
+
"mean_token_accuracy": 0.9348125293850899,
|
| 2795 |
+
"num_tokens": 1548306.0,
|
| 2796 |
+
"step": 2630
|
| 2797 |
+
},
|
| 2798 |
+
{
|
| 2799 |
+
"entropy": 0.27369030360132457,
|
| 2800 |
+
"epoch": 4.224,
|
| 2801 |
+
"grad_norm": 0.8373218774795532,
|
| 2802 |
+
"learning_rate": 1.5616e-05,
|
| 2803 |
+
"loss": 0.2254,
|
| 2804 |
+
"mean_token_accuracy": 0.9375662509351969,
|
| 2805 |
+
"num_tokens": 1566990.0,
|
| 2806 |
+
"step": 2640
|
| 2807 |
+
},
|
| 2808 |
+
{
|
| 2809 |
+
"entropy": 0.3024815677665174,
|
| 2810 |
+
"epoch": 4.24,
|
| 2811 |
+
"grad_norm": 1.3148176670074463,
|
| 2812 |
+
"learning_rate": 1.5296e-05,
|
| 2813 |
+
"loss": 0.2391,
|
| 2814 |
+
"mean_token_accuracy": 0.9351990919560194,
|
| 2815 |
+
"num_tokens": 1580065.0,
|
| 2816 |
+
"step": 2650
|
| 2817 |
+
},
|
| 2818 |
+
{
|
| 2819 |
+
"entropy": 0.2600595161318779,
|
| 2820 |
+
"epoch": 4.256,
|
| 2821 |
+
"grad_norm": 0.6774656176567078,
|
| 2822 |
+
"learning_rate": 1.4976000000000002e-05,
|
| 2823 |
+
"loss": 0.2377,
|
| 2824 |
+
"mean_token_accuracy": 0.9274554952979088,
|
| 2825 |
+
"num_tokens": 1619083.0,
|
| 2826 |
+
"step": 2660
|
| 2827 |
+
},
|
| 2828 |
+
{
|
| 2829 |
+
"entropy": 0.26013899641111493,
|
| 2830 |
+
"epoch": 4.272,
|
| 2831 |
+
"grad_norm": 0.9727310538291931,
|
| 2832 |
+
"learning_rate": 1.4656e-05,
|
| 2833 |
+
"loss": 0.2294,
|
| 2834 |
+
"mean_token_accuracy": 0.934112536534667,
|
| 2835 |
+
"num_tokens": 1646970.0,
|
| 2836 |
+
"step": 2670
|
| 2837 |
+
},
|
| 2838 |
+
{
|
| 2839 |
+
"entropy": 0.25867203902453184,
|
| 2840 |
+
"epoch": 4.288,
|
| 2841 |
+
"grad_norm": 0.9198706150054932,
|
| 2842 |
+
"learning_rate": 1.4336e-05,
|
| 2843 |
+
"loss": 0.2184,
|
| 2844 |
+
"mean_token_accuracy": 0.9373745564371347,
|
| 2845 |
+
"num_tokens": 1669364.0,
|
| 2846 |
+
"step": 2680
|
| 2847 |
+
},
|
| 2848 |
+
{
|
| 2849 |
+
"entropy": 0.26432402124628424,
|
| 2850 |
+
"epoch": 4.304,
|
| 2851 |
+
"grad_norm": 0.9908862709999084,
|
| 2852 |
+
"learning_rate": 1.4016000000000001e-05,
|
| 2853 |
+
"loss": 0.2195,
|
| 2854 |
+
"mean_token_accuracy": 0.9392576098442078,
|
| 2855 |
+
"num_tokens": 1687812.0,
|
| 2856 |
+
"step": 2690
|
| 2857 |
+
},
|
| 2858 |
+
{
|
| 2859 |
+
"entropy": 0.30741472546942533,
|
| 2860 |
+
"epoch": 4.32,
|
| 2861 |
+
"grad_norm": 1.0388495922088623,
|
| 2862 |
+
"learning_rate": 1.3696e-05,
|
| 2863 |
+
"loss": 0.2503,
|
| 2864 |
+
"mean_token_accuracy": 0.9325483400374651,
|
| 2865 |
+
"num_tokens": 1700598.0,
|
| 2866 |
+
"step": 2700
|
| 2867 |
+
},
|
| 2868 |
+
{
|
| 2869 |
+
"epoch": 4.32,
|
| 2870 |
+
"eval_accuracy": 0.02638358121882313,
|
| 2871 |
+
"eval_entropy": 0.3719751555919647,
|
| 2872 |
+
"eval_loss": 0.5846644043922424,
|
| 2873 |
+
"eval_mean_token_accuracy": 0.8568292667865753,
|
| 2874 |
+
"eval_num_tokens": 1700598.0,
|
| 2875 |
+
"eval_runtime": 869.8497,
|
| 2876 |
+
"eval_samples_per_second": 2.299,
|
| 2877 |
+
"eval_steps_per_second": 0.575,
|
| 2878 |
+
"step": 2700
|
| 2879 |
}
|
| 2880 |
],
|
| 2881 |
"logging_steps": 10,
|
|
|
|
| 2895 |
"attributes": {}
|
| 2896 |
}
|
| 2897 |
},
|
| 2898 |
+
"total_flos": 4.639214588564275e+17,
|
| 2899 |
"train_batch_size": 1,
|
| 2900 |
"trial_name": null,
|
| 2901 |
"trial_params": null
|