Training checkpoint at step 8000
Browse files- trainer_state.json +366 -6
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
-
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -2528,6 +2528,366 @@
|
|
| 2528 |
"eval_samples_per_second": 3.212,
|
| 2529 |
"eval_steps_per_second": 1.606,
|
| 2530 |
"step": 7000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2531 |
}
|
| 2532 |
],
|
| 2533 |
"logging_steps": 25,
|
|
@@ -2547,7 +2907,7 @@
|
|
| 2547 |
"attributes": {}
|
| 2548 |
}
|
| 2549 |
},
|
| 2550 |
-
"total_flos": 2.
|
| 2551 |
"train_batch_size": 1,
|
| 2552 |
"trial_name": null,
|
| 2553 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 8000,
|
| 3 |
+
"best_metric": 2.4125914573669434,
|
| 4 |
+
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-8000",
|
| 5 |
+
"epoch": 0.16,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 8000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 2528 |
"eval_samples_per_second": 3.212,
|
| 2529 |
"eval_steps_per_second": 1.606,
|
| 2530 |
"step": 7000
|
| 2531 |
+
},
|
| 2532 |
+
{
|
| 2533 |
+
"epoch": 0.1405,
|
| 2534 |
+
"grad_norm": 0.5975652527083385,
|
| 2535 |
+
"learning_rate": 9.550222222222223e-06,
|
| 2536 |
+
"loss": 2.398,
|
| 2537 |
+
"step": 7025
|
| 2538 |
+
},
|
| 2539 |
+
{
|
| 2540 |
+
"epoch": 0.141,
|
| 2541 |
+
"grad_norm": 0.5642285559875744,
|
| 2542 |
+
"learning_rate": 9.544666666666667e-06,
|
| 2543 |
+
"loss": 2.3907,
|
| 2544 |
+
"step": 7050
|
| 2545 |
+
},
|
| 2546 |
+
{
|
| 2547 |
+
"epoch": 0.1415,
|
| 2548 |
+
"grad_norm": 0.5977243463765347,
|
| 2549 |
+
"learning_rate": 9.539111111111112e-06,
|
| 2550 |
+
"loss": 2.4063,
|
| 2551 |
+
"step": 7075
|
| 2552 |
+
},
|
| 2553 |
+
{
|
| 2554 |
+
"epoch": 0.142,
|
| 2555 |
+
"grad_norm": 0.5938091922766982,
|
| 2556 |
+
"learning_rate": 9.533555555555556e-06,
|
| 2557 |
+
"loss": 2.4064,
|
| 2558 |
+
"step": 7100
|
| 2559 |
+
},
|
| 2560 |
+
{
|
| 2561 |
+
"epoch": 0.142,
|
| 2562 |
+
"eval_loss": 2.4153244495391846,
|
| 2563 |
+
"eval_runtime": 31.6856,
|
| 2564 |
+
"eval_samples_per_second": 3.219,
|
| 2565 |
+
"eval_steps_per_second": 1.61,
|
| 2566 |
+
"step": 7100
|
| 2567 |
+
},
|
| 2568 |
+
{
|
| 2569 |
+
"epoch": 0.1425,
|
| 2570 |
+
"grad_norm": 0.6203811817044198,
|
| 2571 |
+
"learning_rate": 9.528000000000001e-06,
|
| 2572 |
+
"loss": 2.3995,
|
| 2573 |
+
"step": 7125
|
| 2574 |
+
},
|
| 2575 |
+
{
|
| 2576 |
+
"epoch": 0.143,
|
| 2577 |
+
"grad_norm": 0.5748373728564159,
|
| 2578 |
+
"learning_rate": 9.522444444444444e-06,
|
| 2579 |
+
"loss": 2.4052,
|
| 2580 |
+
"step": 7150
|
| 2581 |
+
},
|
| 2582 |
+
{
|
| 2583 |
+
"epoch": 0.1435,
|
| 2584 |
+
"grad_norm": 0.6318360721408016,
|
| 2585 |
+
"learning_rate": 9.51688888888889e-06,
|
| 2586 |
+
"loss": 2.396,
|
| 2587 |
+
"step": 7175
|
| 2588 |
+
},
|
| 2589 |
+
{
|
| 2590 |
+
"epoch": 0.144,
|
| 2591 |
+
"grad_norm": 0.5777480191110791,
|
| 2592 |
+
"learning_rate": 9.511333333333335e-06,
|
| 2593 |
+
"loss": 2.3966,
|
| 2594 |
+
"step": 7200
|
| 2595 |
+
},
|
| 2596 |
+
{
|
| 2597 |
+
"epoch": 0.144,
|
| 2598 |
+
"eval_loss": 2.414691209793091,
|
| 2599 |
+
"eval_runtime": 31.5495,
|
| 2600 |
+
"eval_samples_per_second": 3.233,
|
| 2601 |
+
"eval_steps_per_second": 1.617,
|
| 2602 |
+
"step": 7200
|
| 2603 |
+
},
|
| 2604 |
+
{
|
| 2605 |
+
"epoch": 0.1445,
|
| 2606 |
+
"grad_norm": 0.5896122820881663,
|
| 2607 |
+
"learning_rate": 9.505777777777779e-06,
|
| 2608 |
+
"loss": 2.4018,
|
| 2609 |
+
"step": 7225
|
| 2610 |
+
},
|
| 2611 |
+
{
|
| 2612 |
+
"epoch": 0.145,
|
| 2613 |
+
"grad_norm": 0.6081675838061575,
|
| 2614 |
+
"learning_rate": 9.500222222222222e-06,
|
| 2615 |
+
"loss": 2.4036,
|
| 2616 |
+
"step": 7250
|
| 2617 |
+
},
|
| 2618 |
+
{
|
| 2619 |
+
"epoch": 0.1455,
|
| 2620 |
+
"grad_norm": 0.6032973832585987,
|
| 2621 |
+
"learning_rate": 9.494666666666667e-06,
|
| 2622 |
+
"loss": 2.4025,
|
| 2623 |
+
"step": 7275
|
| 2624 |
+
},
|
| 2625 |
+
{
|
| 2626 |
+
"epoch": 0.146,
|
| 2627 |
+
"grad_norm": 0.6283775464354142,
|
| 2628 |
+
"learning_rate": 9.489111111111113e-06,
|
| 2629 |
+
"loss": 2.4078,
|
| 2630 |
+
"step": 7300
|
| 2631 |
+
},
|
| 2632 |
+
{
|
| 2633 |
+
"epoch": 0.146,
|
| 2634 |
+
"eval_loss": 2.4143505096435547,
|
| 2635 |
+
"eval_runtime": 31.4643,
|
| 2636 |
+
"eval_samples_per_second": 3.242,
|
| 2637 |
+
"eval_steps_per_second": 1.621,
|
| 2638 |
+
"step": 7300
|
| 2639 |
+
},
|
| 2640 |
+
{
|
| 2641 |
+
"epoch": 0.1465,
|
| 2642 |
+
"grad_norm": 0.5969038728051346,
|
| 2643 |
+
"learning_rate": 9.483555555555556e-06,
|
| 2644 |
+
"loss": 2.4066,
|
| 2645 |
+
"step": 7325
|
| 2646 |
+
},
|
| 2647 |
+
{
|
| 2648 |
+
"epoch": 0.147,
|
| 2649 |
+
"grad_norm": 0.6048317665387537,
|
| 2650 |
+
"learning_rate": 9.478e-06,
|
| 2651 |
+
"loss": 2.4007,
|
| 2652 |
+
"step": 7350
|
| 2653 |
+
},
|
| 2654 |
+
{
|
| 2655 |
+
"epoch": 0.1475,
|
| 2656 |
+
"grad_norm": 0.5721050600021237,
|
| 2657 |
+
"learning_rate": 9.472444444444445e-06,
|
| 2658 |
+
"loss": 2.4146,
|
| 2659 |
+
"step": 7375
|
| 2660 |
+
},
|
| 2661 |
+
{
|
| 2662 |
+
"epoch": 0.148,
|
| 2663 |
+
"grad_norm": 0.6019256818391423,
|
| 2664 |
+
"learning_rate": 9.46688888888889e-06,
|
| 2665 |
+
"loss": 2.399,
|
| 2666 |
+
"step": 7400
|
| 2667 |
+
},
|
| 2668 |
+
{
|
| 2669 |
+
"epoch": 0.148,
|
| 2670 |
+
"eval_loss": 2.414281129837036,
|
| 2671 |
+
"eval_runtime": 31.7034,
|
| 2672 |
+
"eval_samples_per_second": 3.217,
|
| 2673 |
+
"eval_steps_per_second": 1.609,
|
| 2674 |
+
"step": 7400
|
| 2675 |
+
},
|
| 2676 |
+
{
|
| 2677 |
+
"epoch": 0.1485,
|
| 2678 |
+
"grad_norm": 0.6386043502919573,
|
| 2679 |
+
"learning_rate": 9.461333333333334e-06,
|
| 2680 |
+
"loss": 2.3957,
|
| 2681 |
+
"step": 7425
|
| 2682 |
+
},
|
| 2683 |
+
{
|
| 2684 |
+
"epoch": 0.149,
|
| 2685 |
+
"grad_norm": 0.5819226766027404,
|
| 2686 |
+
"learning_rate": 9.455777777777777e-06,
|
| 2687 |
+
"loss": 2.4001,
|
| 2688 |
+
"step": 7450
|
| 2689 |
+
},
|
| 2690 |
+
{
|
| 2691 |
+
"epoch": 0.1495,
|
| 2692 |
+
"grad_norm": 0.6372396676223023,
|
| 2693 |
+
"learning_rate": 9.450222222222223e-06,
|
| 2694 |
+
"loss": 2.3976,
|
| 2695 |
+
"step": 7475
|
| 2696 |
+
},
|
| 2697 |
+
{
|
| 2698 |
+
"epoch": 0.15,
|
| 2699 |
+
"grad_norm": 0.5888017578283452,
|
| 2700 |
+
"learning_rate": 9.444666666666668e-06,
|
| 2701 |
+
"loss": 2.4008,
|
| 2702 |
+
"step": 7500
|
| 2703 |
+
},
|
| 2704 |
+
{
|
| 2705 |
+
"epoch": 0.15,
|
| 2706 |
+
"eval_loss": 2.414154291152954,
|
| 2707 |
+
"eval_runtime": 31.8152,
|
| 2708 |
+
"eval_samples_per_second": 3.206,
|
| 2709 |
+
"eval_steps_per_second": 1.603,
|
| 2710 |
+
"step": 7500
|
| 2711 |
+
},
|
| 2712 |
+
{
|
| 2713 |
+
"epoch": 0.1505,
|
| 2714 |
+
"grad_norm": 0.6132781564549638,
|
| 2715 |
+
"learning_rate": 9.439111111111111e-06,
|
| 2716 |
+
"loss": 2.4077,
|
| 2717 |
+
"step": 7525
|
| 2718 |
+
},
|
| 2719 |
+
{
|
| 2720 |
+
"epoch": 0.151,
|
| 2721 |
+
"grad_norm": 0.6063002641957036,
|
| 2722 |
+
"learning_rate": 9.433555555555557e-06,
|
| 2723 |
+
"loss": 2.3889,
|
| 2724 |
+
"step": 7550
|
| 2725 |
+
},
|
| 2726 |
+
{
|
| 2727 |
+
"epoch": 0.1515,
|
| 2728 |
+
"grad_norm": 0.614169638364484,
|
| 2729 |
+
"learning_rate": 9.428e-06,
|
| 2730 |
+
"loss": 2.4121,
|
| 2731 |
+
"step": 7575
|
| 2732 |
+
},
|
| 2733 |
+
{
|
| 2734 |
+
"epoch": 0.152,
|
| 2735 |
+
"grad_norm": 0.5826866596297434,
|
| 2736 |
+
"learning_rate": 9.422444444444445e-06,
|
| 2737 |
+
"loss": 2.4075,
|
| 2738 |
+
"step": 7600
|
| 2739 |
+
},
|
| 2740 |
+
{
|
| 2741 |
+
"epoch": 0.152,
|
| 2742 |
+
"eval_loss": 2.414039134979248,
|
| 2743 |
+
"eval_runtime": 31.7985,
|
| 2744 |
+
"eval_samples_per_second": 3.208,
|
| 2745 |
+
"eval_steps_per_second": 1.604,
|
| 2746 |
+
"step": 7600
|
| 2747 |
+
},
|
| 2748 |
+
{
|
| 2749 |
+
"epoch": 0.1525,
|
| 2750 |
+
"grad_norm": 0.5964985955677213,
|
| 2751 |
+
"learning_rate": 9.41688888888889e-06,
|
| 2752 |
+
"loss": 2.3976,
|
| 2753 |
+
"step": 7625
|
| 2754 |
+
},
|
| 2755 |
+
{
|
| 2756 |
+
"epoch": 0.153,
|
| 2757 |
+
"grad_norm": 0.5946671745059025,
|
| 2758 |
+
"learning_rate": 9.411333333333334e-06,
|
| 2759 |
+
"loss": 2.3947,
|
| 2760 |
+
"step": 7650
|
| 2761 |
+
},
|
| 2762 |
+
{
|
| 2763 |
+
"epoch": 0.1535,
|
| 2764 |
+
"grad_norm": 0.5894909865358033,
|
| 2765 |
+
"learning_rate": 9.405777777777778e-06,
|
| 2766 |
+
"loss": 2.4079,
|
| 2767 |
+
"step": 7675
|
| 2768 |
+
},
|
| 2769 |
+
{
|
| 2770 |
+
"epoch": 0.154,
|
| 2771 |
+
"grad_norm": 0.6048420481174572,
|
| 2772 |
+
"learning_rate": 9.400222222222223e-06,
|
| 2773 |
+
"loss": 2.4015,
|
| 2774 |
+
"step": 7700
|
| 2775 |
+
},
|
| 2776 |
+
{
|
| 2777 |
+
"epoch": 0.154,
|
| 2778 |
+
"eval_loss": 2.413475275039673,
|
| 2779 |
+
"eval_runtime": 31.9136,
|
| 2780 |
+
"eval_samples_per_second": 3.196,
|
| 2781 |
+
"eval_steps_per_second": 1.598,
|
| 2782 |
+
"step": 7700
|
| 2783 |
+
},
|
| 2784 |
+
{
|
| 2785 |
+
"epoch": 0.1545,
|
| 2786 |
+
"grad_norm": 0.617559481688582,
|
| 2787 |
+
"learning_rate": 9.394666666666668e-06,
|
| 2788 |
+
"loss": 2.4036,
|
| 2789 |
+
"step": 7725
|
| 2790 |
+
},
|
| 2791 |
+
{
|
| 2792 |
+
"epoch": 0.155,
|
| 2793 |
+
"grad_norm": 0.6350332331451685,
|
| 2794 |
+
"learning_rate": 9.389111111111112e-06,
|
| 2795 |
+
"loss": 2.3989,
|
| 2796 |
+
"step": 7750
|
| 2797 |
+
},
|
| 2798 |
+
{
|
| 2799 |
+
"epoch": 0.1555,
|
| 2800 |
+
"grad_norm": 0.6034892604414784,
|
| 2801 |
+
"learning_rate": 9.383555555555557e-06,
|
| 2802 |
+
"loss": 2.398,
|
| 2803 |
+
"step": 7775
|
| 2804 |
+
},
|
| 2805 |
+
{
|
| 2806 |
+
"epoch": 0.156,
|
| 2807 |
+
"grad_norm": 0.5879016941841427,
|
| 2808 |
+
"learning_rate": 9.378e-06,
|
| 2809 |
+
"loss": 2.3989,
|
| 2810 |
+
"step": 7800
|
| 2811 |
+
},
|
| 2812 |
+
{
|
| 2813 |
+
"epoch": 0.156,
|
| 2814 |
+
"eval_loss": 2.4134128093719482,
|
| 2815 |
+
"eval_runtime": 31.7809,
|
| 2816 |
+
"eval_samples_per_second": 3.209,
|
| 2817 |
+
"eval_steps_per_second": 1.605,
|
| 2818 |
+
"step": 7800
|
| 2819 |
+
},
|
| 2820 |
+
{
|
| 2821 |
+
"epoch": 0.1565,
|
| 2822 |
+
"grad_norm": 0.5957060592966067,
|
| 2823 |
+
"learning_rate": 9.372444444444446e-06,
|
| 2824 |
+
"loss": 2.3951,
|
| 2825 |
+
"step": 7825
|
| 2826 |
+
},
|
| 2827 |
+
{
|
| 2828 |
+
"epoch": 0.157,
|
| 2829 |
+
"grad_norm": 0.6127788552445546,
|
| 2830 |
+
"learning_rate": 9.36688888888889e-06,
|
| 2831 |
+
"loss": 2.3966,
|
| 2832 |
+
"step": 7850
|
| 2833 |
+
},
|
| 2834 |
+
{
|
| 2835 |
+
"epoch": 0.1575,
|
| 2836 |
+
"grad_norm": 0.6103495429829666,
|
| 2837 |
+
"learning_rate": 9.361333333333335e-06,
|
| 2838 |
+
"loss": 2.3974,
|
| 2839 |
+
"step": 7875
|
| 2840 |
+
},
|
| 2841 |
+
{
|
| 2842 |
+
"epoch": 0.158,
|
| 2843 |
+
"grad_norm": 0.5940303847498369,
|
| 2844 |
+
"learning_rate": 9.355777777777778e-06,
|
| 2845 |
+
"loss": 2.3982,
|
| 2846 |
+
"step": 7900
|
| 2847 |
+
},
|
| 2848 |
+
{
|
| 2849 |
+
"epoch": 0.158,
|
| 2850 |
+
"eval_loss": 2.4130520820617676,
|
| 2851 |
+
"eval_runtime": 31.8718,
|
| 2852 |
+
"eval_samples_per_second": 3.2,
|
| 2853 |
+
"eval_steps_per_second": 1.6,
|
| 2854 |
+
"step": 7900
|
| 2855 |
+
},
|
| 2856 |
+
{
|
| 2857 |
+
"epoch": 0.1585,
|
| 2858 |
+
"grad_norm": 0.5967208318826438,
|
| 2859 |
+
"learning_rate": 9.350222222222224e-06,
|
| 2860 |
+
"loss": 2.3963,
|
| 2861 |
+
"step": 7925
|
| 2862 |
+
},
|
| 2863 |
+
{
|
| 2864 |
+
"epoch": 0.159,
|
| 2865 |
+
"grad_norm": 0.6074697420049116,
|
| 2866 |
+
"learning_rate": 9.344666666666667e-06,
|
| 2867 |
+
"loss": 2.4004,
|
| 2868 |
+
"step": 7950
|
| 2869 |
+
},
|
| 2870 |
+
{
|
| 2871 |
+
"epoch": 0.1595,
|
| 2872 |
+
"grad_norm": 0.6007548308453654,
|
| 2873 |
+
"learning_rate": 9.339111111111112e-06,
|
| 2874 |
+
"loss": 2.3972,
|
| 2875 |
+
"step": 7975
|
| 2876 |
+
},
|
| 2877 |
+
{
|
| 2878 |
+
"epoch": 0.16,
|
| 2879 |
+
"grad_norm": 0.6058573477149505,
|
| 2880 |
+
"learning_rate": 9.333555555555558e-06,
|
| 2881 |
+
"loss": 2.4,
|
| 2882 |
+
"step": 8000
|
| 2883 |
+
},
|
| 2884 |
+
{
|
| 2885 |
+
"epoch": 0.16,
|
| 2886 |
+
"eval_loss": 2.4125914573669434,
|
| 2887 |
+
"eval_runtime": 31.8819,
|
| 2888 |
+
"eval_samples_per_second": 3.199,
|
| 2889 |
+
"eval_steps_per_second": 1.6,
|
| 2890 |
+
"step": 8000
|
| 2891 |
}
|
| 2892 |
],
|
| 2893 |
"logging_steps": 25,
|
|
|
|
| 2907 |
"attributes": {}
|
| 2908 |
}
|
| 2909 |
},
|
| 2910 |
+
"total_flos": 2.546561838661763e+19,
|
| 2911 |
"train_batch_size": 1,
|
| 2912 |
"trial_name": null,
|
| 2913 |
"trial_params": null
|